Source for file odp_rdf_bundle_iterator.php
Documentation is available at odp_rdf_bundle_iterator.php
* Open Source Pure PHP Search Engine, Crawler, and Indexer
* Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* @author Chris Pollett chris@pollett.org
* @license http://www.gnu.org/licenses/ GPL3
* @link http://www.seekquarry.com/
// Refuse to run when BASE_DIR has not been defined: emit an error and stop.
if (!defined('BASE_DIR')) {
    echo "BAD REQUEST";
    exit();
}
* Loads base class for iterating
'/lib/archive_bundle_iterators/text_archive_bundle_iterator.php';
* Used to iterate through the records of a collection of one or more open
* directory RDF files stored in a WebArchiveBundle folder. Open Directory
* files can be found at http://rdf.dmoz.org/ . Iteration would be
* for the purpose of making an index of these records
* Associative array containing global properties like base url of the
* current open odp rdf file
* How many bytes to read into buffer from gzip stream in one go
* Creates an open directory rdf archive iterator with the given parameters.
* @param string $iterate_timestamp timestamp of the arc archive bundle to
* iterate over the pages of
* @param string $iterate_dir folder of files to iterate over
* @param string $result_timestamp timestamp of the arc archive bundle
* results are being stored in
* @param string $result_dir where to write last position checkpoints to
$result_timestamp, $result_dir)
$ini = array( 'compression' => 'gzip',
'file_extension' => 'gz',
'start_delimiter' => '@Topic|ExternalPage@',
'end_delimiter' => '@/Topic|/ExternalPage@');
$result_timestamp, $result_dir, $ini);
$this->header['base_address'] = "http://www.dmoz.org/";
* Estimates the importance of the site according to the weighting of
* the particular archive iterator
* @param array $site an associative array containing info about a web page
* @return int a 4-bit number based on the topic path of the odp entry
* (@see processTopic @see processExternalPage)
return min($site[self::WEIGHT], 15);
* Gets the text content of the first dom node satisfying the
* xpath expression $path in the dom document $dom
* @param object $dom DOMDocument to get the text from
* @param string $path xpath expression to find node with text
* @return string text content of the given node if it exists
$xpath = new DOMXPath($dom);
$objects = $xpath->evaluate($path);
if($objects && is_object($objects) && $objects->item(0) != NULL) {
return $objects->item(0)->textContent;
* Gets the value of the attribute $attribute for each dom node
* satisfying the xpath expression $path in the dom document $dom
* @param object $dom DOMDocument to get the attribute values from
* @param string $path xpath expression selecting the nodes to read
* @param string $attribute name of the attribute to get the values for
* @return array of values of the given attribute
$xpath = new DOMXPath($dom);
$objects = $xpath->evaluate($path);
foreach($objects as $object) {
$value = $object->getAttribute($attribute);
* Gets the value of the attribute $attribute of the first dom node
* satisfying the xpath expression $path in the dom document $dom
* @param object $dom DOMDocument to get the attribute value from
* @param string $path xpath expression selecting the node to read
* @param string $attribute name of the attribute to get the value for
* @return string value of the given attribute
$xpath = new DOMXPath($dom);
$objects = $xpath->evaluate($path);
if($objects && is_object($objects) && $objects->item(0) != NULL) {
return $objects->item(0)->getAttribute($attribute);
* Gets the next doc from the iterator
* @param bool $no_process do not do any processing on page data
* @return array associative array for doc or string if no_process true
array("Topic","ExternalPage"));
list ($page_info, $tag) = $tag_data;
if($no_process) { return $page_info; }
$page_info = str_replace("r:resource","resource", $page_info);
$page_info = str_replace("d:Title","Title", $page_info);
$page_info = str_replace("d:Description","Description", $page_info);
$dom = new DOMDocument();
$dom->loadXML($page_info);
$processMethod = "process". $tag;
$site[self::IP_ADDRESSES] = array($this->header['ip_address']);
$site[self::MODIFIED] = time();
$site[self::TIMESTAMP] = time();
$site[self::TYPE] = "text/html";
$site[self::HEADER] = "odp_rdf_bundle_iterator extractor";
$site[self::HTTP_CODE] = 200;
$site[self::ENCODING] = "UTF-8";
$site[self::SERVER] = "unknown";
$site[self::SERVER_VERSION] = "unknown";
$site[self::OPERATING_SYSTEM] = "unknown";
$this->$processMethod($dom, $site);
* Computes an HTML page for a Topic tag parsed from the ODP RDF
* @param object $dom document object for one Topic tag
* @param array &$site a reference to an array of header and page info
$site[self::URL] = $this->header['base_address']. $topic_path;
$links[$topic_link1] = $topic_link1. " - ". $title;
if($topic_links != NULL) {
foreach($topic_links as $topic_link) {
$links[$topic_link] = $topic_link. " - ". $title;
$site[self::PAGE] = "<html>\n".
"<head><title>$title</title></head>\n"
. "<body><h1>$title</h1>\n";
$site[self::PAGE] .= "</body></html>";
* Computes an HTML page for an ExternalPage tag parsed from the ODP RDF
* @param object $dom document object for one ExternalPage tag
* @param array &$site a reference to an array of header and page info
"/ExternalPage", "about");
$title = "$title - ". str_replace("/", " ", $topic_path);
$dom, "/ExternalPage/Description");
$site[self::PAGE] = "<html>\n".
"<head><title>$title</title></head>\n"
. "<body><h1>$title</h1>\n";
$site[self::PAGE] .= "<div>$description</div></body></html>";
* Computes links for prefix topics of an ODP topic path
* @param string $topic_path to compute links for
* @return array url => text pairs for each prefix of path
$topic_parts = explode("/", $topic_path);
foreach($topic_parts as $part){
$links[$this->header['base_address']. $path] = $part;
* Makes an unordered HTML list out of an associative array of
* url => link_text pairs.
* @param array $links url=>link_text pairs
* @return string containing html for an unordered list of links
foreach($links as $url => $text) {
$html .= '<li><a href="'.
$url. '">'. $text. '</a></li>';
|