Chris Pollett [2010-12-06]
Adds a command line tool for IndexArchiveBundles and WebArchiveBundles (Issue 23), a=chris
Filename
bin/arc_tool.php
bin/queue_server.php
controllers/admin_controller.php
lib/processors/bmp_processor.php
lib/processors/rss_processor.php
lib/processors/sitemap_processor.php
lib/processors/svg_processor.php
lib/processors/xml_processor.php
diff --git a/bin/arc_tool.php b/bin/arc_tool.php
new file mode 100644
index 000000000..4e7c6be9b
--- /dev/null
+++ b/bin/arc_tool.php
@@ -0,0 +1,350 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009, 2010  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage bin
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+/** Calculate base directory of script */
+define("BASE_DIR", substr(
+    dirname(realpath($_SERVER['PHP_SELF'])), 0,
+    -strlen("/bin")));
+
+/** Load in global configuration settings */
+require_once BASE_DIR.'/configs/config.php';
+if(!PROFILE) {
+    echo "Please configure the search engine instance by visiting" .
+        "its web interface on localhost.\n";
+    exit();
+}
+
+/** Load the class that maintains our URL queue */
+require_once BASE_DIR."/lib/web_queue_bundle.php";
+
+/** Load word->{array of docs with word} index class */
+require_once BASE_DIR."/lib/index_archive_bundle.php";
+
+/** Used for manipulating urls*/
+require_once BASE_DIR."/lib/url_parser.php";
+
+/**  For crawlHash function */
+require_once BASE_DIR."/lib/utility.php";
+
+/** Loads common constants for web crawling*/
+require_once BASE_DIR."/lib/crawl_constants.php";
+
+/*
+ *  We'll set up multi-byte string handling to use UTF-8
+ */
+mb_internal_encoding("UTF-8");
+mb_regex_encoding("UTF-8");
+
+/**
+ * Command line program that allows one to examine the content of
+ * the WebArchiveBundles and IndexArchiveBundles of Yioop crawls.
+ * For now it supports returning header information about bundles,
+ * as well as pretty printing the page/summary contents of the bundle.
+ *
+ * The former can be gotten from a bundle by running arc_tool with a
+ * command like:
+ * php arc_tool.php info bundle_name
+ *
+ * The latter can be gotten from a bundle by running arc_tool with a
+ * command like:
+ * php arc_tool.php list bundle_name start_doc_num num_results
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ */
+class ArcTool implements CrawlConstants
+{
+
+    /**
+     * The maximum number of documents the arc_tool list function
+     * will read into memory in one go.
+     */
+    const MAX_BUFFER_DOCS = 200;
+
+    /**
+     * Initializes the ArcTool, for now does nothing
+     */
+    function __construct()
+    {
+
+    }
+
+    /**
+     * Runs the ArcTool on the supplied command line arguments
+     */
+    function start()
+    {
+        global $argv;
+
+        if(isset($_SERVER['DOCUMENT_ROOT']) &&
+            strlen($_SERVER['DOCUMENT_ROOT']) > 0) {
+            echo "BAD REQUEST";
+            exit();
+        }
+
+        if(!isset($argv[1])) {
+            usageMessageAndExit();
+        }
+
+        switch($argv[1])
+        {
+            case "info":
+                if(!isset($argv[2]) ) {
+                    $this->usageMessageAndExit();
+                }
+                $this->outputInfo($argv[2]);
+            break;
+
+            case "list":
+                if(!isset($argv[2]) || !isset($argv[3]) ||
+                    !isset($argv[4])) {
+                    $this->usageMessageAndExit();
+                }
+                $this->outputList($argv[2], $argv[3], $argv[4]);
+            break;
+
+            default:
+                $this->usageMessageAndExit();
+        }
+
+    }
+
+    /**
+     * Determines whether the supplied name is a WebArchiveBundle or
+     * an IndexArchiveBundle. Then outputs to stdout header information
+     * about the bundle by calling the appropriate sub-function.
+     *
+     * @param string $archive_name the name of a directory that holds
+     *      WebArchiveBundle or IndexArchiveBundle data
+     */
+    function outputInfo($archive_name)
+    {
+        $bundle_name = UrlParser::getDocumentFilename($archive_name);
+        echo "Bundle Name: ".$bundle_name."\n";
+        $archive_type = $this->getArchiveKind($archive_name);
+        if($archive_type === false) {
+            $this->badFormatMessageAndExit($archive_name);
+        }
+        echo "Bundle Type: ".$archive_type."\n";
+        $call = "outputInfo".$archive_type;
+        $info = $archive_type::getArchiveInfo($archive_name);
+        $this->$call($info, $archive_name);
+    }
+
+    /**
+     * Outputs to stdout header information for an IndexArchiveBundle
+     * bundle.
+     *
+     * @param array $info header info that has already been read from
+     *      the description.txt file
+     * @param string $archive_name the name of the folder containing the bundle
+     */
+    function outputInfoIndexArchiveBundle($info, $archive_name)
+    {
+        $more_info = unserialize($info['DESCRIPTION']);
+        unset($info['DESCRIPTION']);
+        $info = array_merge($info, $more_info);
+        echo "Description: ".$info['DESCRIPTION']."\n";
+        $generation_info = unserialize(
+            file_get_contents("$archive_name/generation.txt"));
+        $num_generations = $generation_info['ACTIVE']+1;
+        echo "Number of generations: ".$num_generations."\n";
+        echo "Number of stored links and documents: ".$info['COUNT']."\n";
+        echo "Number of stored documents: ".$info['VISITED_URLS_COUNT']."\n";
+        $crawl_order = ($info[self::CRAWL_ORDER] == self::BREADTH_FIRST) ?
+            "Bread First" : "Page Importance";
+        echo "Crawl order was: $crawl_order\n";
+        echo "Seed sites:\n";
+        foreach($info[self::TO_CRAWL] as $seed) {
+            echo "   $seed\n";
+        }
+        if($info[self::RESTRICT_SITES_BY_URL]) {
+            echo "Sites allowed to crawl:\n";
+            foreach($info[self::ALLOWED_SITES] as $site) {
+                echo "   $site\n";
+            }
+        }
+        echo "Sites not allowed to be crawled:\n";
+        foreach($info[self::DISALLOWED_SITES] as $site) {
+            echo "   $site\n";
+        }
+        echo "Meta Words:\n";
+        foreach($info[self::META_WORDS] as $word) {
+            echo "   $word\n";
+        }
+        echo "\n";
+    }
+
+    /**
+     * Outputs to stdout header information for a WebArchiveBundle
+     * bundle.
+     *
+     * @param array $info header info that has already been read from
+     *      the description.txt file
+     * @param string $archive_name the name of the folder containing the bundle
+     */
+    function outputInfoWebArchiveBundle($info, $archive_name)
+    {
+        echo "Description: ".$info['DESCRIPTION']."\n";
+        echo "Number of stored documents: ".$info['COUNT']."\n";
+        echo "Maximum Number of documents per partition: ".
+            $info['NUM_DOCS_PER_PARTITION']."\n";
+        echo "Number of partitions: ".
+            ($info['WRITE_PARTITION']+1)."\n";
+        echo "\n";
+    }
+
+    /**
+     * Used to list out the pages/summaries stored in a bundle
+     * $archive_name. It lists to stdout $num many documents starting at $start.
+     *
+     * @param string $archive_name name of bundle to list documents for
+     * @param int $start first document to list
+     * @param int $num number of documents to list
+     */
+    function outputList($archive_name, $start, $num)
+    {
+        $fields_to_print = array(
+            self::URL => "URL",
+            self::HTTP_CODE => "HTTP RESPONSE CODE",
+            self::TYPE => "MIMETYPE",
+            self::ENCODING => "CHARACTER ENCODING",
+            self::DESCRIPTION => "DESCRIPTION",
+            self::PAGE => "PAGE DATA");
+        $archive_type = $this->getArchiveKind($archive_name);
+        if($archive_type === false) {
+            $this->badFormatMessageAndExit($archive_name);
+        }
+        $info = $archive_type::getArchiveInfo($archive_name);
+        $num = min($num, $info["COUNT"] - $start);
+
+        if($archive_type == "IndexArchiveBundle") {
+            $generation_info = unserialize(
+                file_get_contents("$archive_name/generation.txt"));
+            $num_generations = $generation_info['ACTIVE']+1;
+            $archive = new WebArchiveBundle($archive_name."/summaries");
+        } else {
+            $num_generations = $info["WRITE_PARTITION"]+1;
+            $archive = new WebArchiveBundle($archive_name);
+        }
+        $num = max($num, 0);
+        $total = $start + $num;
+        $seen = 0;
+        $generation = 0;
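+        /* scan the bundle partition by partition: skip whole partitions
+           that end before $start, then read objects in chunks of at most
+           MAX_BUFFER_DOCS, printing only those numbered $start or later */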
+        while($seen < $total && $generation < $num_generations) {
+            $partition = $archive->getPartition($generation, false);
+            if($partition->count < $start && $seen < $start) {
+                $generation++;
+                $seen += $partition->count;
+                continue;
+            }
+            $seen_generation = 0;
+            while($seen < $total && $seen_generation < $partition->count) {
+                $num_to_get = min($total - $seen,
+                    $partition->count - $seen_generation,
+                    self::MAX_BUFFER_DOCS);
+                $objects = $partition->nextObjects($num_to_get);
+                $seen += $num_to_get;
+                $seen_generation += $num_to_get;
+                if($seen > $start) {
+                    $num_to_show = min($seen - $start, $num_to_get);
+                    $cnt = 0;
+                    $first = $num_to_get - $num_to_show;
+                    foreach($objects as $object) {
+                        if($cnt >= $first) {
+                            $out = "";
+                            foreach($fields_to_print as $key => $name) {
+                                if(isset($object[1][$key])) {
+                                    $out .= "[$name]\n";
+                                    $out .= $object[1][$key]."\n";
+                                }
+                            }
+                            $out .= "==========\n\n";
+                            echo "BEGIN ITEM, LENGTH:".strlen($out)."\n";
+                            echo $out;
+                        }
+                        $cnt++;
+                    }
+                }
+            }
+            $generation++;
+        }
+    }
+
+    /**
+     * Given a folder name, determines the kind of bundle (if any) it holds.
+     * It does this based on the expected location of the description.txt file.
+     *
+     * @param string $archive_name the name of folder
+     * @return mixed the archive bundle type, either "WebArchiveBundle" or
+     *      "IndexArchiveBundle", or false if neither
+     */
+    function getArchiveKind($archive_name)
+    {
+        if(file_exists("$archive_name/description.txt")) {
+            return "WebArchiveBundle";
+        }
+        if(file_exists("$archive_name/summaries/description.txt")) {
+            return "IndexArchiveBundle";
+        }
+        return false;
+    }
+
+    /**
+     * Outputs the "hey, this isn't a known bundle message" and then exit()'s.
+     *
+     * @param string $archive_name name of the folder that was not a bundle
+     */
+    function badFormatMessageAndExit($archive_name)
+    {
+        echo "$archive_name does not appear to be a web or index ".
+        "archive bundle\n";
+        exit();
+    }
+
+    /**
+     * Outputs the "how to use this tool message" and then exit()'s.
+     */
+    function usageMessageAndExit()
+    {
+        echo "arc_tool is used to look at the contents of";
+        echo " WebArchiveBundles and IndexArchiveBundles.\n For example,\n";
+        echo "php arc_tool.php info bundle_name //return info about ".
+            "documents stored in archive.\n";
+        echo "php arc_tool.php list bundle_name start num //outputs".
+            " items start through num.from bundle_name\n";
+        exit();
+    }
+}
+
+$arc_tool = new ArcTool();
+$arc_tool->start();
+
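
For example, one might run the tool as below (a sketch; the bundle path is
made up, though Yioop index bundles are typically named IndexData followed
by a timestamp):

    php bin/arc_tool.php info /some/path/IndexData1291705610
    php bin/arc_tool.php list /some/path/IndexData1291705610 0 10

The first command prints the bundle's header information; the second pretty
prints summaries 0 through 9.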
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 5f8ae33cb..9b39cc87f 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -164,7 +164,7 @@ class QueueServer implements CrawlConstants
      */
     var $last_index_save_time;
     /**
-     * flasg for whether the index has data to be written to disk
+     * flag for whether the index has data to be written to disk
      * @var int
      */
      var $index_dirty;
diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php
index 514c926ee..d402227ec 100755
--- a/controllers/admin_controller.php
+++ b/controllers/admin_controller.php
@@ -1358,6 +1358,10 @@ class AdminController extends Controller implements CrawlConstants
                         } else {
                             $clean_field = $_POST[$field];
                         }
+                        if($field == "QUEUE_SERVER" && strlen($clean_field) > 0
+                            && $clean_field[strlen($clean_field) - 1] != "/") {
+                            $clean_field .= "/";
+                        }
                         $data[$field] = $clean_field;
                         $profile[$field] = $data[$field];
                         if($field == "MEMCACHE_SERVERS") {
diff --git a/lib/processors/bmp_processor.php b/lib/processors/bmp_processor.php
new file mode 100644
index 000000000..5f8ac4cd7
--- /dev/null
+++ b/lib/processors/bmp_processor.php
@@ -0,0 +1,77 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009, 2010  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage processor
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/** Used for the getDocumentFilename method in UrlParser */
+require_once BASE_DIR."/lib/url_parser.php";
+/** Load base class, if needed */
+require_once BASE_DIR."/lib/processors/image_processor.php";
+
+/**
+ * Used to create crawl summary information
+ * for BMP and ICO files
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage processor
+ */
+class BmpProcessor extends ImageProcessor
+{
+
+    /**
+     * {@inheritdoc}
+     */
+    public static function process($page, $url)
+    {
+        $summary = NULL;
+        if(is_string($page)) {
+            file_put_contents(CRAWL_DIR."/cache/tmp.bmp", $page);
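+            // imagecreatefrombmp is not a PHP built-in in this era; it is
+            // assumed to be supplied elsewhere in the code base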
+            $image = @imagecreatefrombmp(CRAWL_DIR."/cache/tmp.bmp");
+            $thumb_string = self::createThumb($image);
+            $summary[self::TITLE] = "";
+            $summary[self::DESCRIPTION] = "Image of ".
+                UrlParser::getDocumentFilename($url);
+            $summary[self::LINKS] = array();
+            $summary[self::PAGE] =
+                "<html><body><div><img src='data:image/bmp;base64," .
+                base64_encode($page)."' alt='".$summary[self::DESCRIPTION].
+                "' /></div></body></html>";
+            $summary[self::THUMB] = 'data:image/jpeg;base64,'.
+                base64_encode($thumb_string);
+        }
+        return $summary;
+    }
+
+}
+
+?>
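
As a rough usage sketch (not part of the commit; the file name and url are
invented, and the Yioop configuration constants must already be loaded):

    $page = file_get_contents("test.bmp");
    $summary = BmpProcessor::process($page, "http://www.example.com/test.bmp");
    // $summary[BmpProcessor::PAGE] holds an html wrapper around the image;
    // $summary[BmpProcessor::THUMB] is a base64 jpeg thumbnail data url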
diff --git a/lib/processors/rss_processor.php b/lib/processors/rss_processor.php
new file mode 100644
index 000000000..f3c054fe6
--- /dev/null
+++ b/lib/processors/rss_processor.php
@@ -0,0 +1,248 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009, 2010  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage processor
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Load base class, if needed.
+ */
+require_once BASE_DIR."/lib/processors/text_processor.php";
+/**
+ * Load so can parse urls
+ */
+require_once BASE_DIR."/lib/url_parser.php";
+
+/**
+ * Used to create crawl summary information
+ * for RSS files
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage processor
+ */
+class RssProcessor extends TextProcessor
+{
+    const MAX_DESCRIPTION_LEN = 2000;
+
+
+    /**
+     *  Used to extract the title, description and links from
+     *  a string consisting of rss news feed data.
+     *
+     *  @param string $page   web-page contents
+     *  @param string $url   the url where the page contents came from,
+     *     used to canonicalize relative links
+     *
+     *  @return array  a summary of the contents of the page
+     *
+     */
+    public static function process($page, $url)
+    {
+        $summary = NULL;
+        if(is_string($page)) {
+            $dom = self::dom($page);
+
+            if($dom !== false) {
+                $summary[self::TITLE] = self::title($dom);
+                $summary[self::DESCRIPTION] = self::description($dom);
+                $summary[self::LINKS] = self::links($dom, $url);
+
+                if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE])
+                    == 0 && count($summary[self::LINKS]) == 0) {
+                    //maybe not rss? treat as text still try to get urls
+                    $summary = parent::process($page, $url);
+                }
+            }
+        }
+
+        return $summary;
+
+    }
+
+
+
+    /**
+     * Return a document object based on a string containing the contents of
+     * an RSS page
+     *
+     *  @param string $page   a web page
+     *
+     *  @return mixed  a DOMDocument object, or false if the page would
+     *      not parse as XML
+     */
+    static function dom($page)
+    {
+        $dom = new DOMDocument();
+        if(@$dom->loadXML($page) === false) {
+            return false;
+        }
+        return $dom;
+    }
+
+
+    /**
+     *  Returns the title of an RSS feed based on its document object
+     *
+     *  @param object $dom   a document object to extract a title from.
+     *  @return string  a title of the page
+     *
+     */
+    static function title($dom)
+    {
+        $xpath = new DOMXPath($dom);
+        $titles = $xpath->evaluate("/rss/channel/title");
+
+        $title = "";
+
+        foreach($titles as $pre_title) {
+            $title .= $pre_title->textContent;
+        }
+
+        return $title;
+    }
+
+    /**
+     * Returns descriptive text concerning a webpage based on its document
+     * object
+     *
+     * @param object $dom   a document object to extract a description from.
+     * @return string a description of the page
+     */
+    static function description($dom)
+    {
+        $xpath = new DOMXPath($dom);
+
+        $description = "";
+
+        /*
+          concatenate the contents of the additional dom elements up to
+          the limit of description length
+        */
+        $page_parts = array("/rss/channel/description",
+            "/rss/channel/category", "/rss/channel/lastBuildDate",
+            "/rss/channel/copyright");
+        foreach($page_parts as $part) {
+            $doc_nodes = $xpath->evaluate($part);
+            foreach($doc_nodes as $node) {
+                $description .= " ".$node->textContent;
+                if(strlen($description) > self::MAX_DESCRIPTION_LEN) { break 2;}
+            }
+        }
+        $description = mb_ereg_replace("(\s)+", " ",  $description);
+
+        return $description;
+    }
+
+    /**
+     * Returns up to MAX_LINKS_PER_PAGE many links from the supplied
+     * dom object where links have been canonicalized according to
+     * the supplied $site information.
+     *
+     * @param object $dom   a document object with links on it
+     * @param string $site   a string containing a url
+     *
+     * @return array   links from the $dom object
+     */
+    static function links($dom, $site)
+    {
+        $sites = array();
+
+        $xpath = new DOMXPath($dom);
+
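+        /* each xpath below is paired with the names of the child tags that
+           hold a link's url and its anchor text */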
+        $link_nodes = array(
+            "/rss/channel" => array( "url" =>"link", "text" => "title"),
+            "/rss/channel/image" => array( "url" =>"url", "text" => "title"),
+            "/rss/channel/item" => array( "url" =>"link", "text" => "title"),
+        );
+
+        $i = 0;
+
+        foreach($link_nodes as $path => $url_text_pair) {
+            $nodes = $xpath->evaluate($path);
+            foreach($nodes as $node) {
+                $result = self::linkAndTexts($node,
+                    $url_text_pair['url'], $url_text_pair['text'], $site);
+                if($result != false) {
+                    list($url, $text) = $result;
+                    $sites[$url] = $text;
+                    $i++;
+                }
+                if($i >= MAX_LINKS_PER_PAGE) {
+                    break 2;
+                }
+            }
+
+        }
+
+        return $sites;
+    }
+
+    /**
+     * Returns a url text pair where the url comes from the link of
+     * the given item node and the text comes from the text data for that node.
+     * urls are canonicalized according to site.
+     *
+     * @param object $item_node the DOMNode to get a link and text from
+     * @param string $link_name name of link tag
+     * @param string $text_name name of text tag to associate with link
+     * @param string $site   a string containing a url
+     *
+     * @return mixed   a url,text pair, or false if no usable link was found
+     */
+    static function linkAndTexts($item_node, $link_name, $text_name, $site)
+    {
+        $url = NULL;
+        $text = "";
+        foreach($item_node->childNodes as $node) {
+            if($node->nodeName == $link_name) {
+                $url = UrlParser::canonicalLink(
+                    $node->textContent, $site);
+                if($url === NULL || $url === "" ||
+                    UrlParser::checkRecursiveUrl($url)) {
+                    return false;
+                }
+            }
+            if($node->nodeName == $text_name) {
+                $text = $node->textContent;
+            }
+        }
+        if($url === NULL) {
+            return false;
+        }
+        if($text == "") {
+            $text = "RSS Feed";
+        }
+        $text = mb_ereg_replace("(\s)+", " ",  $text);
+        return array($url, $text);
+    }
+
+}
+
+?>
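
A minimal sketch of driving the processor directly (the feed string and
urls here are invented examples, and the Yioop configuration is assumed to
be loaded so that MAX_LINKS_PER_PAGE is defined):

    $feed = '<rss><channel><title>Example Feed</title>' .
        '<link>http://www.example.com/</link>' .
        '<item><title>A Story</title>' .
        '<link>http://www.example.com/story.html</link></item>' .
        '</channel></rss>';
    $summary = RssProcessor::process($feed, "http://www.example.com/feed");
    // $summary[RssProcessor::TITLE] is "Example Feed" and
    // $summary[RssProcessor::LINKS] maps each canonical url to its title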
diff --git a/lib/processors/sitemap_processor.php b/lib/processors/sitemap_processor.php
new file mode 100644
index 000000000..def712d60
--- /dev/null
+++ b/lib/processors/sitemap_processor.php
@@ -0,0 +1,151 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009, 2010  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage processor
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Load base class, if needed.
+ */
+require_once BASE_DIR."/lib/processors/text_processor.php";
+/**
+ * Load so can parse urls
+ */
+require_once BASE_DIR."/lib/url_parser.php";
+
+/**
+ * Used to create crawl summary information
+ * for sitemap files
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage processor
+ */
+class SitemapProcessor extends TextProcessor
+{
+
+    /**
+     *  Used to extract the title, description and links from
+     *  a string consisting of sitemap data.
+     *
+     *  @param string $page   web-page contents
+     *  @param string $url   the url where the page contents came from,
+     *     used to canonicalize relative links
+     *
+     *  @return array  a summary of the contents of the page
+     *
+     */
+    public static function process($page, $url)
+    {
+        $summary = NULL;
+        if(is_string($page)) {
+            $dom = self::dom($page);
+
+            if($dom !== false) {
+                $summary[self::TITLE] = $url;
+                $summary[self::DESCRIPTION] = "Sitemap of ".$url;
+                $summary[self::LINKS] = self::links($dom, $url);
+
+                if(count($summary[self::LINKS]) == 0) {
+                    /* title and description are synthesized from $url, so
+                       check the links: maybe not a sitemap? treat as text
+                       and still try to get urls */
+                    $summary = parent::process($page, $url);
+                }
+            }
+        }
+
+        return $summary;
+
+    }
+
+
+
+    /**
+     * Return a document object based on a string containing the contents of
+     * a sitemap page
+     *
+     *  @param string $page   a web page
+     *
+     *  @return mixed  a DOMDocument object, or false if the page would
+     *      not parse as XML
+     */
+    static function dom($page)
+    {
+        $dom = new DOMDocument();
+        if(@$dom->loadXML($page) === false) {
+            return false;
+        }
+        return $dom;
+    }
+
+
+    /**
+     * Returns links from the supplied dom object of a sitemap
+     * where links have been canonicalized according to
+     * the supplied $site information. We allow more links from a sitemap
+     * than from other kinds of documents. For now we are ignoring weighting
+     * info
+     *
+     * @param object $dom   a document object with links on it
+     * @param string $site   a string containing a url
+     *
+     * @return array   links from the $dom object
+     */
+    static function links($dom, $site)
+    {
+        $sites = array();
+
+        $xpath = new DOMXPath($dom);
+        $xpath->registerNamespace('s',
+            "http://www.sitemaps.org/schemas/sitemap/0.9");
+        $paths = array(
+            "/s:urlset/s:url/s:loc",
+            "/s:sitemapindex/s:sitemap/s:loc"
+        );
+
+        foreach($paths as $path) {
+            $nodes = $xpath->evaluate($path);
+            echo "hi".$nodes->length;
+            foreach($nodes as $node) {
+                $url = UrlParser::canonicalLink(
+                    $node->textContent, $site);
+                if($url === NULL || $url === "" ||
+                    UrlParser::checkRecursiveUrl($url)) {
+                    continue;
+                }
+                $sites[$url] = "From sitemap of ".$site;
+            }
+
+        }
+        return $sites;
+    }
+}
+?>
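
A minimal sketch of the extraction (urls invented; the namespace
registration above is what lets the s: xpaths match):

    $map = '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' .
        '<url><loc>http://www.example.com/a.html</loc></url></urlset>';
    $summary = SitemapProcessor::process($map,
        "http://www.example.com/sitemap.xml");
    // $summary[SitemapProcessor::LINKS] maps http://www.example.com/a.html
    // to "From sitemap of http://www.example.com/sitemap.xml"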
diff --git a/lib/processors/svg_processor.php b/lib/processors/svg_processor.php
new file mode 100644
index 000000000..b54bc6be2
--- /dev/null
+++ b/lib/processors/svg_processor.php
@@ -0,0 +1,216 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009, 2010  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage processor
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Load base class, if needed.
+ */
+require_once BASE_DIR."/lib/processors/text_processor.php";
+
+/**
+ * Load so can parse urls
+ */
+require_once BASE_DIR."/lib/url_parser.php";
+
+/**
+ * Used for convertPixels
+ */
+require_once BASE_DIR."/lib/utility.php";
+
+
+/**
+ * Used to create crawl summary information
+ * for SVG files. This class is a little bit
+ * weird in that it generates thumbs like the
+ * image processor classes, but when it gives
+ * up on the data it falls back to text
+ * processor handling.
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage processor
+ */
+class SvgProcessor extends TextProcessor
+{
+    const MAX_DESCRIPTION_LEN = 2000;
+    const MAX_THUMB_LEN = 5000;
+
+
+    /**
+     *  Used to extract the title, description and links from
+     *  a string consisting of svg image. If the image is small
+     *  enough, an attempt is made to generate a thumbnail
+     *
+     *  @param string $page   web-page contents
+     *  @param string $url   the url where the page contents came from,
+     *     used to canonicalize relative links
+     *
+     *  @return array  a summary of the contents of the page
+     *
+     */
+    public static function process($page, $url)
+    {
+        $summary = NULL;
+        if(is_string($page)) {
+            self::closeDanglingTags($page);
+            $dom = self::dom($page);
+
+            if($dom !== false && isset($dom->documentElement)) {
+                $summary[self::TITLE] = "";
+                $summary[self::DESCRIPTION] = self::description($dom);
+                $summary[self::LINKS] = array();
+                $summary[self::PAGE] =
+                    "<html><body><div><img src='data:image/svg+xml;base64," .
+                    base64_encode($page)."' alt='".$summary[self::DESCRIPTION].
+                    "' /></div></body></html>";
+                if(strlen($page) < self::MAX_THUMB_LEN) {
+                    $thumb_string = self::createThumb($dom);
+                    $summary[self::THUMB] = 'data:image/svg+xml;base64,'.
+                        base64_encode($thumb_string);
+                }
+            } else {
+                $summary = parent::process($page, $url);
+            }
+        }
+        return $summary;
+    }
+
+    /**
+     * Used to create an svg thumbnail from a dom object
+     *
+     * @param object $dom a dom svg image object
+     *
+     * @return string the svg image data of the resized thumbnail
+     */
+    static function createThumb($dom)
+    {
+        $svg = $dom->documentElement;
+        if($svg->hasAttribute("width")) {
+            $width = $svg->getAttribute("width");
+        } else {
+            $width = 600;
+        }
+        $width = convertPixels($width);
+        if($svg->hasAttribute("height")) {
+            $height = $svg->getAttribute("height");
+        } else {
+            $height = 600;
+        }
+        $height = convertPixels($height);
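+        /* render at thumbnail size; if no viewBox was given, add one with
+           the original dimensions so the drawing scales down rather than
+           being cropped */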
+        $svg->setAttributeNS("", "width", "150px");
+        $svg->setAttributeNS("", "height", "150px");
+        if(!$svg->hasAttribute("viewBox")) {
+            $svg->setAttributeNS("", "viewBox", "0 0 $width $height");
+        }
+
+        return $dom->saveXML();
+
+    }
+
+
+    /**
+     * Return a document object based on a string containing the contents of
+     * an SVG page
+     *
+     *  @param string $page   a web page
+     *
+     *  @return mixed  a DOMDocument object, or false if the page would
+     *      not parse as XML
+     */
+    static function dom($page)
+    {
+        $dom = new DOMDocument();
+        if(@$dom->loadXML($page) === false) {
+            return false;
+        }
+        return $dom;
+    }
+
+
+    /**
+     *  Returns the desc text of an svg image based on its document object
+     *
+     *  @param object $dom   a document object to extract a title from.
+     *  @return string  a title of the page
+     *
+     */
+    static function title($dom)
+    {
+        $xpath = new DOMXPath($dom);
+        $titles = $xpath->evaluate("/svg//desc");
+
+        $title = "";
+
+        foreach($titles as $pre_title) {
+            $title .= $pre_title->textContent;
+        }
+
+        return $title;
+    }
+
+    /**
+     * Returns descriptive text concerning a svg page based on its document
+     * object
+     *
+     * @param object $dom   a document object to extract a description from.
+     * @return string a description of the page
+     */
+    static function description($dom)
+    {
+        $xpath = new DOMXPath($dom);
+
+        $description = "";
+
+        /*
+          concatenate the contents of the additional dom elements up to
+          the limit of description length
+        */
+        $page_parts = array("/svg//desc",
+            "/svg//text");
+        foreach($page_parts as $part) {
+            $doc_nodes = $xpath->evaluate($part);
+            foreach($doc_nodes as $node) {
+                $description .= " ".$node->textContent;
+                if(strlen($description) > self::MAX_DESCRIPTION_LEN) { break 2;}
+            }
+        }
+        $description = mb_ereg_replace("(\s)+", " ",  $description);
+
+        return $description;
+    }
+
+}
+
+?>
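
A minimal sketch (the svg string and url are invented). Note the /svg//desc
and /svg//text xpaths above only match documents whose elements are not in
a namespace; a file declaring the usual svg xmlns would get an empty
description but still be thumbnailed:

    $svg = '<svg width="200" height="100"><desc>A box</desc>' .
        '<rect width="200" height="100"/></svg>';
    $summary = SvgProcessor::process($svg, "http://www.example.com/box.svg");
    // $summary[SvgProcessor::THUMB] is a data url for a 150x150 rendering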
diff --git a/lib/processors/xml_processor.php b/lib/processors/xml_processor.php
new file mode 100644
index 000000000..79cd166ec
--- /dev/null
+++ b/lib/processors/xml_processor.php
@@ -0,0 +1,141 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009, 2010  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage processor
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Load base class, if needed.
+ */
+require_once BASE_DIR."/lib/processors/text_processor.php";
+
+/**
+ * If XML turns out to be RSS ...
+ */
+require_once BASE_DIR."/lib/processors/rss_processor.php";
+
+/**
+ * If XML turns out to be XHTML ...
+ */
+require_once BASE_DIR."/lib/processors/html_processor.php";
+
+/**
+ * If XML turns out to be a sitemap ...
+ */
+require_once BASE_DIR."/lib/processors/sitemap_processor.php";
+
+/**
+ * If XML turns out to be SVG ...
+ */
+require_once BASE_DIR."/lib/processors/svg_processor.php";
+
+
+/**
+ * Load so can parse urls
+ */
+require_once BASE_DIR."/lib/url_parser.php";
+
+/**
+ * Used to create crawl summary information
+ * for XML files (those served as text/xml)
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage processor
+ */
+class XmlProcessor extends TextProcessor
+{
+    const MAX_DESCRIPTION_LEN = 2000;
+
+
+    /**
+     *  Used to extract the title, description and links from
+     *  a string consisting of XML data. The root element name is used to
+     *  route the page to a more specific processor when one applies.
+     *
+     *  @param string $page   web-page contents
+     *  @param string $url   the url where the page contents came from,
+     *     used to canonicalize relative links
+     *
+     *  @return array  a summary of the contents of the page
+     *
+     */
+    public static function process($page, $url)
+    {
+        $summary = NULL;
+        if(is_string($page)) {
+            self::closeDanglingTags($page);
+
+            $dom = self::dom($page);
+
+            $root_name = isset($dom->documentElement->nodeName) ?
+                $dom->documentElement->nodeName : "";
+            unset($dom);
+
+            switch ($root_name)
+            {
+                case "rss":
+                    $summary = RssProcessor::process($page, $url);
+                break;
+                case "html":
+                    $summary = HtmlProcessor::process($page, $url);
+                break;
+                case "sitemapindex":
+                    $summary = SitemapProcessor::process($page, $url);
+                break;
+                case "urlset":
+                    $summary = SitemapProcessor::process($page, $url);
+                break;
+                case "svg":
+                    $summary = SvgProcessor::process($page, $url);
+                break;
+                default:
+                    $summary = parent::process($page, $url);
+            }
+        }
+
+        return $summary;
+
+    }
+
+
+
+    /**
+     * Return a document object based on a string containing the contents of
+     * an XML page
+     *
+     *  @param string $page   a web page
+     *
+     *  @return mixed  a DOMDocument object, or false if the page would
+     *      not parse as XML
+     */
+    static function dom($page)
+    {
+        $dom = new DOMDocument();
+        if(@$dom->loadXML($page) === false) {
+            return false;
+        }
+        return $dom;
+    }
+
+}
+
+?>
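
Routing is by root element rather than mime type, so, as a sketch, an svg
image served as text/xml still reaches the right processor:

    // $svg as in the svg_processor example above
    $summary = XmlProcessor::process($svg, "http://www.example.com/box.svg");
    // dispatches to SvgProcessor::process because the root element is svg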