Preliminary support for open directory rdf files added, a=chris

Chris Pollett [2011-01-25 08:Jan:th]

Preliminary support for open directory rdf files added, a=chris

Filename
configs/config.php
lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
lib/processors/html_processor.php
lib/processors/rss_processor.php
lib/processors/text_processor.php

diff --git a/configs/config.php b/configs/config.php
index 2bad1a1cd..75ceb962c 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -286,6 +286,10 @@ $PAGE_PROCESSORS = array(   "text/html" => "HtmlProcessor",
 /** Characters we view as not part of words, not same as POSIX [:punct:]*/
 define ('PUNCT', "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\||\&");

+/** Ratio of ASCII words to all words above which we guess English */
+define ('EN_RATIO', 0.9);
+
+
 /**
  * How many non robot urls the fetcher successfully downloads before
  * between times data sent back to queue server
diff --git a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
index 57779a469..a487cde3f 100644
--- a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
@@ -94,7 +94,7 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
      */
     var $header;
     /**
-     *  File handle for current arc file
+     *  File handle for current mediawiki file
      *  @var resource
      */
     var $fh;
@@ -227,14 +227,17 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
      *
      * @param object $dom DOMDocument to get the text from
      * @param $path xpath expression to find node with text
+     *
+     * @return string text content of the given node if it exists
      */
     function getTextContent($dom, $path)
     {
         $xpath = new DOMXPath($dom);
         $objects = $xpath->evaluate($path);
-        if($objects  && is_object($objects)) {
+        if($objects  && is_object($objects) && $objects->item(0) != NULL ) {
             return $objects->item(0)->textContent;
         }
+        return "";
     }

     /**
diff --git a/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php b/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
new file mode 100644
index 000000000..a1be59e80
--- /dev/null
+++ b/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
@@ -0,0 +1,461 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009, 2010, 2011  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage iterator
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010, 2011
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Loads base class for iterating
+ */
+require_once BASE_DIR.
+    '/lib/archive_bundle_iterators/archive_bundle_iterator.php';
+
+/**
+ * Used to iterate through the records of a collection of one or more open
+ * directory RDF files stored in a WebArchiveBundle folder. Open Directory
+ * files can be found at http://rdf.dmoz.org/ .  Iteration would be
+ * for the purpose of making an index of these records
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage iterator
+ * @see WebArchiveBundle
+ */
+class OdpRdfArchiveBundleIterator implements CrawlConstants
+{
+    /**
+     * The number of odp rdf files in this archive bundle
+     *  @var int
+     */
+    var $num_partitions;
+
+    /**
+     *  Counting in glob order for this odp rdf archive bundle directory, the
+     *  current active file number of the odp rdf file being processed.
+     *
+     *  @var int
+     */
+    var $current_partition_num;
+    /**
+     *  Current number of pages read so far from the odp rdf files
+     *  @var int
+     */
+    var $current_page_num;
+    /**
+     *  Array of filenames of odp rdf .gz files in this directory (glob order)
+     *  @var array
+     */
+    var $partitions;
+    /**
+     *  Used to buffer data from the currently opened odp rdf file
+     *  @var string
+     */
+    var $buffer;
+    /**
+     *  Associative array containing global properties like base url of the
+     *  current open odp rdf file
+     *  @var array
+     */
+    var $header;
+    /**
+     *  File handle for current odp rdf file
+     *  @var resource
+     */
+    var $fh;
+    /**
+     * How many bytes to read into buffer from gzip stream in one go
+     */
+    const BLOCK_SIZE = 8192;
+    /**
+     * Creates an open directory rdf archive iterator with the given parameters.
+     *
+     * @param string $iterate_timestamp timestamp of the odp rdf archive
+     *      bundle to iterate over the pages of
+     * @param string $result_timestamp timestamp of the odp rdf archive
+     *      bundle results are being stored in
+     */
+    function __construct($iterate_timestamp, $result_timestamp)
+    {
+        $this->iterate_timestamp = $iterate_timestamp;
+        $this->result_timestamp = $result_timestamp;
+        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
+            $iterate_timestamp;
+        $this->partitions = array();
+        foreach(glob("$archive_name/*.gz") as $filename) {
+            $this->partitions[] = $filename;
+        }
+        $this->num_partitions = count($this->partitions);
+        $this->header['base_address'] = "http://www.dmoz.org/";
+        $url_parts = @parse_url($this->header['base_address']);
+        $this->header['ip_address'] = gethostbyname($url_parts['host']);
+
+        if(file_exists("$archive_name/iterate_status.txt")) {
+            $info = unserialize(file_get_contents(
+                "$archive_name/iterate_status.txt"));
+            $this->end_of_iterator = $info['end_of_iterator'];
+            $this->current_partition_num = $info['current_partition_num'];
+            $this->current_offset = $info['current_offset'];
+
+            $this->fh=gzopen(
+                $this->partitions[$this->current_partition_num], "r");
+            $this->buffer = "";
+            $this->readPages($this->current_page_num, false);
+        } else {
+            $this->reset();
+        }
+    }
+
+    /**
+     * Used to extract data between two tags for the first tag found
+     * amongst the array of tags $tags. After operation $this->buffer has
+     * contents after the close tag.
+     *
+     * @param array $tags array of tagnames to look for
+     *
+     * @return mixed array(tag data, tag name) for first tag found, else false
+     */
+    function getNextTagsData($tags)
+    {
+        do {
+            $done = false;
+            if(!$this->fh || feof($this->fh)) {return false;}
+            $this->buffer .= gzread($this->fh, self::BLOCK_SIZE);
+
+            foreach($tags as $tag) {
+                if(stristr($this->buffer, "</$tag")) {
+                    $done = true;
+                }
+            }
+        } while(!$done);
+        $found_tag = "";
+        $min_pos_tag = strlen($this->buffer);
+        foreach($tags as $tag) {
+            $pos_tag = strpos($this->buffer, $tag);
+            if( $pos_tag !== false) {
+                if($found_tag == "" || $pos_tag < $min_pos_tag) {
+                    $found_tag = $tag;
+                    $min_pos_tag = $pos_tag;
+                }
+            }
+        }
+        $start_info = strpos($this->buffer, "<$found_tag");
+        $pre_end_info = strpos($this->buffer, "</$found_tag", $start_info);
+        $end_info = strpos($this->buffer, ">", $pre_end_info) + 1;
+        $tag_info = substr($this->buffer, $start_info,
+            $end_info - $start_info);
+        $this->buffer = substr($this->buffer, $end_info);
+        return array($tag_info, $found_tag);
+    }
+
+    /**
+     * Gets the text content of the first dom node satisfying the
+     * xpath expression $path in the dom document $dom
+     *
+     * @param object $dom DOMDocument to get the text from
+     * @param $path xpath expression to find node with text
+     *
+     * @return string text content of the given node if it exists
+     */
+    function getTextContent($dom, $path)
+    {
+        $xpath = new DOMXPath($dom);
+        $objects = $xpath->evaluate($path);
+        if($objects  && is_object($objects) && $objects->item(0) != NULL) {
+            return $objects->item(0)->textContent;
+        }
+        return "";
+    }
+
+    /**
+     * Gets the value of the attribute $attribute for each dom node
+     * satisfying the xpath expression $path in the dom document $dom
+     *
+     * @param object $dom DOMDocument to get the text from
+     * @param $path xpath expression to find node with text
+     * @param string $attribute name of the attribute to get the values for
+     *
+     * @return array of values of the given attribute
+     */
+    function getAttributeValueAll($dom, $path, $attribute)
+    {
+        $values = array();
+        $xpath = new DOMXPath($dom);
+        $objects = $xpath->evaluate($path);
+        if($objects  && is_object($objects)) {
+            foreach($objects as $object) {
+                $value = $object->getAttribute($attribute);
+                if($value) {
+                    $values[] = $value;
+                }
+            }
+        }
+        return $values;
+    }
+
+    /**
+     * Gets the value of the attribute $attribute of the first dom node
+     * satisfying the xpath expression $path in the dom document $dom
+     *
+     * @param object $dom DOMDocument to get the text from
+     * @param $path xpath expression to find node with text
+     * @param string $attribute name of the attribute to get the value for
+     *
+     * @return string value of the given attribute
+     */
+    function getAttributeValue($dom, $path,  $attribute)
+    {
+        $xpath = new DOMXPath($dom);
+        $objects = $xpath->evaluate($path);
+        if($objects  && is_object($objects) && $objects->item(0) != NULL) {
+            return $objects->item(0)->getAttribute($attribute);
+        }
+        return "";
+    }
+
+    /**
+     * Resets the iterator to the start of the archive bundle
+     */
+    function reset()
+    {
+        $this->current_partition_num = -1;
+        $this->end_of_iterator = false;
+        $this->current_offset = 0;
+        $this->fh = NULL;
+        $this->buffer = "";
+        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
+            $this->result_timestamp;
+        @unlink("$archive_name/iterate_status.txt");
+    }
+
+    /**
+     * Gets the next $num many Topic or ExternalPage pages from the iterator
+     * @param int $num number of docs to get
+     * @return array associative arrays of data for $num pages
+     */
+    function nextPages($num)
+    {
+        return $this->readPages($num, true);
+    }
+
+    /**
+     * Reads the next at most $num many ODP pages from the iterator. It might
+     * return less than $num many documents if the partition changes or the end
+     * of the bundle is reached.
+     *
+     * @param int $num number of pages to get
+     * @param bool $return_pages whether to return all of the pages or
+     *      not. If not, then doesn't bother storing them
+     * @return array associative arrays for $num pages
+     */
+    function readPages($num, $return_pages)
+    {
+        $pages = array();
+        $page_count = 0;
+        for($i = 0; $i < $num; $i++) {
+            $page = $this->readPage($return_pages);
+            if(!$page) {
+                if(is_resource($this->fh)) {
+                    gzclose($this->fh);
+                }
+                $this->current_partition_num++;
+                if($this->current_partition_num >= $this->num_partitions) {
+                    $this->end_of_iterator = true;
+                    break;
+                }
+                $this->fh = gzopen(
+                    $this->partitions[$this->current_partition_num], "r");
+            } else {
+                if($return_pages) {
+                    $pages[] = $page;
+                }
+                $page_count++;
+            }
+        }
+        if(is_resource($this->fh)) {
+            $this->current_page_num += $page_count;
+        }
+
+        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
+            $this->result_timestamp;
+        $info = array();
+        $info['end_of_iterator'] = $this->end_of_iterator;
+        $info['current_partition_num'] = $this->current_partition_num;
+        $info['current_page_num'] = $this->current_page_num;
+        file_put_contents("$archive_name/iterate_status.txt",
+            serialize($info));
+        return $pages;
+    }
+
+
+    /**
+     * Gets the next doc from the iterator
+     * @return array associative array for doc
+     */
+    function readPage($return_page)
+    {
+        if(!is_resource($this->fh)) return NULL;
+        list($page_info, $tag) = $this->getNextTagsData(
+            array("Topic","ExternalPage"));
+        if(!$return_page) {
+            return true;
+        }
+        $page_info = str_replace("r:id","id", $page_info);
+        $page_info = str_replace("r:resource","resource", $page_info);
+        $page_info = str_replace("d:Title","Title", $page_info);
+        $page_info = str_replace("d:Description","Description", $page_info);
+        $dom = new DOMDocument();
+        $dom->loadXML($page_info);
+        $processMethod = "process".$tag;
+        $site[self::IP_ADDRESSES] = array($this->header['ip_address']);
+        $site[self::MODIFIED] = time();
+        $site[self::TIMESTAMP] = time();
+        $site[self::TYPE] = "text/html";
+        $site[self::HEADER] = "odp_rdf_bundle_iterator extractor";
+        $site[self::HTTP_CODE] = 200;
+        $site[self::ENCODING] = "UTF-8";
+        $site[self::SERVER] = "unknown";
+        $site[self::SERVER_VERSION] = "unknown";
+        $site[self::OPERATING_SYSTEM] = "unknown";
+        $site[self::WEIGHT] = 1;
+        $this->$processMethod($dom, $site);
+
+        $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
+
+        return $site;
+    }
+
+    /**
+     *  Computes an HTML page for a Topic tag parsed from the ODP RDF
+     *  document
+     *
+     *  @param object $dom document object for one Topic tag
+     *  @param array &$site a reference to an array of header and page info
+     *      for an html page
+     */
+    function processTopic($dom, &$site)
+    {
+        $topic_path = $this->getAttributeValue($dom, "/Topic", "id");
+        $site[self::URL] = $this->header['base_address'].$topic_path;
+
+        $title = str_replace("/", " ", $topic_path);
+        $links = $this->computeTopicLinks($topic_path);
+
+        $topic_link1 = $this->getAttributeValue($dom, "/Topic/link1",
+            "resource");
+        if($topic_link1) {
+            $links[$topic_link1] = $topic_link1." - ".$title;
+        }
+
+        $topic_links = $this->getAttributeValueAll($dom, "/Topic/link",
+            "resource");
+        if($topic_links != NULL) {
+            foreach($topic_links as $topic_link) {
+                $links[$topic_link] = $topic_link." - ".$title;
+            }
+        }
+        $site[self::PAGE] = "<html>\n".
+            "<head><title>$title</title></head>\n"
+            ."<body><h1>$title</h1>\n";
+        $site[self::PAGE] .= $this->linksToHtml($links);
+        $site[self::PAGE] .= "</body></html>";
+
+    }
+
+    /**
+     *  Computes an HTML page for an ExternalPage tag parsed from the ODP RDF
+     *  document
+     *
+     *  @param object $dom document object for one ExternalPage tag
+     *  @param array &$site a reference to an array of header and page info
+     *      for an html page
+     */
+    function processExternalPage($dom, &$site)
+    {
+        $site[self::URL] = $this->getAttributeValue($dom,
+            "/ExternalPage", "about");
+
+        $topic_path = $this->getTextContent($dom, "/ExternalPage/topic");
+
+        $links = $this->computeTopicLinks($topic_path);
+        $title = $this->getTextContent($dom, "/ExternalPage/Title");
+        $title = "$title - ".str_replace("/", " ", $topic_path);
+        $description = $this->getTextContent(
+            $dom, "/ExternalPage/Description");
+
+        $site[self::PAGE] = "<html>\n".
+            "<head><title>$title</title></head>\n"
+            ."<body><h1>$title</h1>\n";
+        $site[self::PAGE] .= $this->linksToHtml($links);
+        $site[self::PAGE] .= "<div>$description</div></body></html>";
+    }
+
+    /**
+     *  Computes links for prefix topics of an ODP topic path
+     *
+     *  @param string $topic_path to compute links for
+     *  @return array url => text pairs for each prefix of path
+     */
+    function computeTopicLinks($topic_path)
+    {
+        $links = array();
+        $topic_parts = explode("/", $topic_path);
+        $path = "";
+
+        foreach($topic_parts as $part){
+            $path .= "/$part";
+            $links[$this->header['base_address'].$path] = $part;
+        }
+        return $links;
+    }
+
+    /**
+     *  Makes an unordered HTML list out of an associative array of
+     *  url => link_text pairs.
+     *
+     *  @param array $links url=>link_text pairs
+     *  @return string containing html for unordered list of links
+     */
+    function linksToHtml($links)
+    {
+        $html = "";
+        if(count($links) > 0) {
+            $html .= "<ul>\n";
+            foreach($links as $url => $text) {
+                $html .= '<li><a href="'.
+                    $url.'">'.$text.'</a></li>';
+            }
+            $html .= "</ul>\n";
+        }
+        return $html;
+    }
+}
+?>
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index 65e444193..b32ab5a42 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -160,7 +160,7 @@ class HtmlProcessor extends TextProcessor
                 }
             }
             // crude, but let's guess ASCII == english
-            if($ascii_count/$num_words > 0.9) {
+            if($ascii_count/$num_words > EN_RATIO) {
                 $lang = 'en';
             } else {
                 $lang = NULL;
diff --git a/lib/processors/rss_processor.php b/lib/processors/rss_processor.php
index 78f3d7622..42018e530 100644
--- a/lib/processors/rss_processor.php
+++ b/lib/processors/rss_processor.php
@@ -114,7 +114,7 @@ class RssProcessor extends TextProcessor
                 }
             }
             // crude, but let's guess ASCII == english
-            if($ascii_count/$num_words > 0.9) {
+            if($ascii_count/$num_words > EN_RATIO) {
                 $lang = 'en';
             } else {
                 $lang = NULL;
diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php
index a835e0f91..f8ccbcb93 100755
--- a/lib/processors/text_processor.php
+++ b/lib/processors/text_processor.php
@@ -96,7 +96,7 @@ class TextProcessor implements CrawlConstants
                 }
             }
             // crude, but let's guess ASCII == english
-            if($ascii_count/$num_words > 0.9) {
+            if($ascii_count/$num_words > EN_RATIO) {
                 $lang = 'en';
             } else {
                 $lang = NULL;

ViewGit