diff --git a/bin/arc_tool.php b/bin/arc_tool.php
new file mode 100644
index 000000000..4e7c6be9b
--- /dev/null
+++ b/bin/arc_tool.php
@@ -0,0 +1,352 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage bin
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+/** Calculate base directory of script */
+define("BASE_DIR", substr(
+    dirname(realpath($_SERVER['PHP_SELF'])), 0,
+    -strlen("/bin")));
+
+/** Load in global configuration settings */
+require_once BASE_DIR.'/configs/config.php';
+if(!PROFILE) {
+    echo "Please configure the search engine instance by visiting " .
+        "its web interface on localhost.\n";
+    exit();
+}
+
+/** Load the class that maintains our URL queue */
+require_once BASE_DIR."/lib/web_queue_bundle.php";
+
+/** Load word->{array of docs with word} index class */
+require_once BASE_DIR."/lib/index_archive_bundle.php";
+
+/** Used for manipulating urls*/
+require_once BASE_DIR."/lib/url_parser.php";
+
+/** For crawlHash function */
+require_once BASE_DIR."/lib/utility.php";
+
+/** Loads common constants for web crawling*/
+require_once BASE_DIR."/lib/crawl_constants.php";
+
+/*
+ * We'll set up multi-byte string handling to use UTF-8
+ */
+mb_internal_encoding("UTF-8");
+mb_regex_encoding("UTF-8");
+
+/**
+ * Command line program that allows one to examine the content of
+ * the WebArchiveBundles and IndexArchiveBundles of Yioop crawls.
+ * For now it supports returning header information about bundles,
+ * as well as pretty printing the page/summary contents of the bundle.
+ *
+ * The former can be gotten from a bundle by running arc_tool with a
+ * command like:
+ * php arc_tool.php info bundle_name
+ *
+ * The latter can be gotten from a bundle by running arc_tool with a
+ * command like:
+ * php arc_tool.php list bundle_name start_doc_num num_results
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ */
+class ArcTool implements CrawlConstants
+{
+
+    /**
+     * The maximum number of documents the arc_tool list function
+     * will read into memory in one go.
+     */
+    const MAX_BUFFER_DOCS = 200;
+
+    /**
+     * Initializes the ArcTool, for now does nothing
+     */
+    function __construct()
+    {
+
+    }
+
+    /**
+     * Runs the ArcTool on the supplied command line arguments
+     */
+    function start()
+    {
+        global $argv;
+
+        if(isset($_SERVER['DOCUMENT_ROOT']) &&
+            strlen($_SERVER['DOCUMENT_ROOT']) > 0) {
+            echo "BAD REQUEST";
+            exit();
+        }
+
+        if(!isset($argv[1])) {
+            $this->usageMessageAndExit();
+        }
+
+        switch($argv[1])
+        {
+            case "info":
+                if(!isset($argv[2])) {
+                    $this->usageMessageAndExit();
+                }
+                $this->outputInfo($argv[2]);
+                break;
+
+            case "list":
+                if(!isset($argv[2]) || !isset($argv[3]) ||
+                    !isset($argv[4])) {
+                    $this->usageMessageAndExit();
+                }
+                $this->outputList($argv[2], $argv[3], $argv[4]);
+                break;
+
+            default:
+                $this->usageMessageAndExit();
+        }
+
+    }
+
+    /**
+     * Determines whether the supplied name is a WebArchiveBundle or
+     * an IndexArchiveBundle. Then outputs to stdout header information
+     * about the bundle by calling the appropriate sub-function.
+     *
+     * @param string $archive_name the name of a directory that holds
+     *     WebArchiveBundle or IndexArchiveBundle data
+     */
+    function outputInfo($archive_name)
+    {
+        $bundle_name = UrlParser::getDocumentFilename($archive_name);
+        echo "Bundle Name: ".$bundle_name."\n";
+        $archive_type = $this->getArchiveKind($archive_name);
+        if($archive_type === false) {
+            $this->badFormatMessageAndExit($archive_name);
+        }
+        echo "Bundle Type: ".$archive_type."\n";
+        $call = "outputInfo".$archive_type;
+        $info = $archive_type::getArchiveInfo($archive_name);
+        $this->$call($info, $archive_name);
+    }
+
+    /**
+     * Outputs to stdout header information for an IndexArchiveBundle
+     * bundle.
+     *
+     * @param array $info header info that has already been read from
+     *     the description.txt file
+     * @param string $archive_name the name of the folder containing the bundle
+     */
+    function outputInfoIndexArchiveBundle($info, $archive_name)
+    {
+        $more_info = unserialize($info['DESCRIPTION']);
+        unset($info['DESCRIPTION']);
+        $info = array_merge($info, $more_info);
+        echo "Description: ".$info['DESCRIPTION']."\n";
+        $generation_info = unserialize(
+            file_get_contents("$archive_name/generation.txt"));
+        $num_generations = $generation_info['ACTIVE']+1;
+        echo "Number of generations: ".$num_generations."\n";
+        echo "Number of stored links and documents: ".$info['COUNT']."\n";
+        echo "Number of stored documents: ".$info['VISITED_URLS_COUNT']."\n";
+        $crawl_order = ($info[self::CRAWL_ORDER] == self::BREADTH_FIRST) ?
+            "Breadth First" : "Page Importance";
+        echo "Crawl order was: $crawl_order\n";
+        echo "Seed sites:\n";
+        foreach($info[self::TO_CRAWL] as $seed) {
+            echo "   $seed\n";
+        }
+        if($info[self::RESTRICT_SITES_BY_URL]) {
+            echo "Sites allowed to crawl:\n";
+            foreach($info[self::ALLOWED_SITES] as $site) {
+                echo "   $site\n";
+            }
+        }
+        echo "Sites not allowed to be crawled:\n";
+        foreach($info[self::DISALLOWED_SITES] as $site) {
+            echo "   $site\n";
+        }
+        echo "Meta Words:\n";
+        foreach($info[self::META_WORDS] as $word) {
+            echo "   $word\n";
+        }
+        echo "\n";
+    }
+
+    /**
+     * Outputs to stdout header information for a WebArchiveBundle
+     * bundle.
+     *
+     * @param array $info header info that has already been read from
+     *     the description.txt file
+     * @param string $archive_name the name of the folder containing the bundle
+     */
+    function outputInfoWebArchiveBundle($info, $archive_name)
+    {
+        echo "Description: ".$info['DESCRIPTION']."\n";
+        echo "Number of stored documents: ".$info['COUNT']."\n";
+        echo "Maximum Number of documents per partition: ".
+            $info['NUM_DOCS_PER_PARTITION']."\n";
+        echo "Number of partitions: ".
+            ($info['WRITE_PARTITION']+1)."\n";
+        echo "\n";
+    }
+
+    /**
+     * Used to list out the pages/summaries stored in a bundle
+     * $archive_name. It lists to stdout $num many documents starting at
+     * $start.
+     *
+     * @param string $archive_name name of bundle to list documents for
+     * @param int $start first document to list
+     * @param int $num number of documents to list
+     */
+    function outputList($archive_name, $start, $num)
+    {
+        $fields_to_print = array(
+            self::URL => "URL",
+            self::HTTP_CODE => "HTTP RESPONSE CODE",
+            self::TYPE => "MIMETYPE",
+            self::ENCODING => "CHARACTER ENCODING",
+            self::DESCRIPTION => "DESCRIPTION",
+            self::PAGE => "PAGE DATA");
+        $archive_type = $this->getArchiveKind($archive_name);
+        if($archive_type === false) {
+            $this->badFormatMessageAndExit($archive_name);
+        }
+        $info = $archive_type::getArchiveInfo($archive_name);
+        $num = min($num, $info["COUNT"] - $start);
+
+        if($archive_type == "IndexArchiveBundle") {
+            $generation_info = unserialize(
+                file_get_contents("$archive_name/generation.txt"));
+            $num_generations = $generation_info['ACTIVE']+1;
+            $archive = new WebArchiveBundle($archive_name."/summaries");
+        } else {
+            $num_generations = $info["WRITE_PARTITION"]+1;
+            $archive = new WebArchiveBundle($archive_name);
+        }
+        $num = max($num, 0);
+        $total = $start + $num;
+        $seen = 0;
+        $generation = 0;
+        while($seen < $total && $generation < $num_generations) {
+            $partition = $archive->getPartition($generation, false);
+            if($partition->count < $start && $seen < $start) {
+                $generation++;
+                $seen += $partition->count;
+                continue;
+            }
+            $seen_generation = 0;
+            while($seen < $total && $seen_generation < $partition->count) {
+                $num_to_get = min($total - $seen,
+                    $partition->count - $seen_generation,
+                    self::MAX_BUFFER_DOCS);
+                $objects = $partition->nextObjects($num_to_get);
+                $seen += $num_to_get;
+                $seen_generation += $num_to_get;
+                if($seen > $start) {
+                    $num_to_show = min($seen - $start, $num_to_get);
+                    $cnt = 0;
+                    $first = $num_to_get - $num_to_show;
+                    foreach($objects as $object) {
+                        if($cnt >= $first) {
+                            $out = "";
+                            foreach($fields_to_print as $key => $name) {
+                                if(isset($object[1][$key])) {
+                                    $out .= "[$name]\n";
+                                    $out .= $object[1][$key]."\n";
+                                }
+                            }
+                            $out .= "==========\n\n";
+                            echo "BEGIN ITEM, LENGTH:".strlen($out)."\n";
+                            echo $out;
+                        }
+                        $cnt++;
+                    }
+                }
+            }
+            $generation++;
+        }
+    }
+
+    /**
+     * Given a folder name, determines the kind of bundle (if any) it holds.
+     * It does this based on the expected location of the description.txt
+     * file.
+     *
+     * @param string $archive_name the name of folder
+     * @return string the archive bundle type, either: WebArchiveBundle or
+     *     IndexArchiveBundle
+     */
+    function getArchiveKind($archive_name)
+    {
+        if(file_exists("$archive_name/description.txt")) {
+            return "WebArchiveBundle";
+        }
+        if(file_exists("$archive_name/summaries/description.txt")) {
+            return "IndexArchiveBundle";
+        }
+        return false;
+    }
+
+    /**
+     * Outputs the "hey, this isn't a known bundle message" and then exit()'s.
+     *
+     * @param string $archive_name name of the problem archive folder
+     */
+    function badFormatMessageAndExit($archive_name)
+    {
+        echo "$archive_name does not appear to be a web or index ".
+            "archive bundle\n";
+        exit();
+    }
+
+    /**
+     * Outputs the "how to use this tool message" and then exit()'s.
+     */
+    function usageMessageAndExit()
+    {
+        echo "arc_tool is used to look at the contents of";
+        echo " WebArchiveBundles and IndexArchiveBundles.\n For example,\n";
+        echo "php arc_tool.php info bundle_name //return info about ".
+            "documents stored in archive.\n";
+        echo "php arc_tool.php list bundle_name start num //outputs".
+            " items start through num from bundle_name\n";
+        exit();
+    }
+}
+
+$arc_tool = new ArcTool();
+$arc_tool->start();
+
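
For orientation, here is what a session with the tool might look like; the
bundle path is illustrative (crawl bundles normally live under the cache
folder of the configured work directory):

    php bin/arc_tool.php info /some/path/cache/IndexData1270238282
    php bin/arc_tool.php list /some/path/cache/IndexData1270238282 0 2

The first command prints the header fields produced by
outputInfoIndexArchiveBundle above; the second pretty prints summaries 0
and 1, each preceded by a BEGIN ITEM, LENGTH: marker.
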
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 5f8ae33cb..9b39cc87f 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -164,7 +164,7 @@ class QueueServer implements CrawlConstants
      */
     var $last_index_save_time;
     /**
-     * flasg for whether the index has data to be written to disk
+     * flags for whether the index has data to be written to disk
      * @var int
      */
     var $index_dirty;
diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php
index 514c926ee..d402227ec 100755
--- a/controllers/admin_controller.php
+++ b/controllers/admin_controller.php
@@ -1358,6 +1358,10 @@
             } else {
                 $clean_field = $_POST[$field];
             }
+            if($field == "QUEUE_SERVER" && $clean_field != "" &&
+                substr($clean_field, -1) != "/") {
+                $clean_field .= "/";
+            }
             $data[$field] = $clean_field;
             $profile[$field] = $data[$field];
             if($field == "MEMCACHE_SERVERS") {
diff --git a/lib/processors/bmp_processor.php b/lib/processors/bmp_processor.php
new file mode 100644
index 000000000..5f8ac4cd7
--- /dev/null
+++ b/lib/processors/bmp_processor.php
@@ -0,0 +1,78 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage processor
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/** Used for the getDocumentFilename method in UrlParser */
+require_once BASE_DIR."/lib/url_parser.php";
+/** Load base class, if needed */
+require_once BASE_DIR."/lib/processors/image_processor.php";
+
+/**
+ * Used to create crawl summary information
+ * for BMP and ICO files
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage processor
+ */
+class BmpProcessor extends ImageProcessor
+{
+
+    /**
+     * {@inheritdoc}
+     */
+    public static function process($page, $url)
+    {
+        $summary = NULL;
+        if(is_string($page)) {
+            file_put_contents(CRAWL_DIR."/cache/tmp.bmp", $page);
+            $image = @imagecreatefrombmp(CRAWL_DIR."/cache/tmp.bmp");
+            $thumb_string = self::createThumb($image);
+            $summary[self::TITLE] = "";
+            $summary[self::DESCRIPTION] = "Image of ".
+                UrlParser::getDocumentFilename($url);
+            $summary[self::LINKS] = array();
+            $summary[self::PAGE] =
+                "<html><body><div><img src='data:image/bmp;base64," .
+                base64_encode($page)."' alt='".$summary[self::DESCRIPTION].
+                "' /></div></body></html>";
+            $summary[self::THUMB] = 'data:image/jpeg;base64,'.
+                base64_encode($thumb_string);
+        }
+        return $summary;
+    }
+
+}
+
+?>
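
As a sketch of how this processor gets used (the file name and url are made
up, and imagecreatefrombmp is assumed to be available to PHP, whether from
GD or from a Yioop-supplied fallback):

    $page = file_get_contents("/tmp/logo.bmp"); // raw bmp bytes
    $summary = BmpProcessor::process($page,
        "http://www.example.com/logo.bmp");
    echo $summary[BmpProcessor::DESCRIPTION]; // Image of logo.bmp
    // $summary[BmpProcessor::PAGE] inlines the original image as a
    // base64 data url; $summary[BmpProcessor::THUMB] holds a jpeg thumb

Constants such as DESCRIPTION and THUMB resolve through the CrawlConstants
interface that the processor hierarchy implements.
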
diff --git a/lib/processors/rss_processor.php b/lib/processors/rss_processor.php
new file mode 100644
index 000000000..f3c054fe6
--- /dev/null
+++ b/lib/processors/rss_processor.php
@@ -0,0 +1,253 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage processor
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Load base class, if needed.
+ */
+require_once BASE_DIR."/lib/processors/text_processor.php";
+/**
+ * Load so can parse urls
+ */
+require_once BASE_DIR."/lib/url_parser.php";
+
+/**
+ * Used to create crawl summary information
+ * for RSS files
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage processor
+ */
+class RssProcessor extends TextProcessor
+{
+    const MAX_DESCRIPTION_LEN = 2000;
+
+
+    /**
+     * Used to extract the title, description and links from
+     * a string consisting of rss news feed data.
+     *
+     * @param string $page web-page contents
+     * @param string $url the url where the page contents came from,
+     *    used to canonicalize relative links
+     *
+     * @return array a summary of the contents of the page
+     *
+     */
+    public static function process($page, $url)
+    {
+        $summary = NULL;
+        if(is_string($page)) {
+            $dom = self::dom($page);
+
+            if($dom !== false) {
+                $summary[self::TITLE] = self::title($dom);
+                $summary[self::DESCRIPTION] = self::description($dom);
+                $summary[self::LINKS] = self::links($dom, $url);
+
+                if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE])
+                    == 0 && count($summary[self::LINKS]) == 0) {
+                    //maybe not rss? treat as text and still try to get urls
+                    $summary = parent::process($page, $url);
+                }
+            }
+        }
+
+        return $summary;
+
+    }
+
+
+
+    /**
+     * Return a document object based on a string containing the contents of
+     * an RSS page
+     *
+     * @param string $page a web page
+     *
+     * @return object document object
+     */
+    static function dom($page)
+    {
+        $dom = new DOMDocument();
+
+        @$dom->loadXML($page);
+
+        return $dom;
+    }
+
+
+    /**
+     * Returns the title of an RSS feed based on its document object
+     *
+     * @param object $dom a document object to extract a title from.
+     * @return string a title of the page
+     *
+     */
+    static function title($dom)
+    {
+        $sites = array();
+
+        $xpath = new DOMXPath($dom);
+        $titles = $xpath->evaluate("/rss/channel/title");
+
+        $title = "";
+
+        foreach($titles as $pre_title) {
+            $title .= $pre_title->textContent;
+        }
+
+        return $title;
+    }
+
+    /**
+     * Returns descriptive text concerning an RSS feed based on its document
+     * object
+     *
+     * @param object $dom a document object to extract a description from.
+     * @return string a description of the page
+     */
+    static function description($dom) {
+        $sites = array();
+
+        $xpath = new DOMXPath($dom);
+
+        $description = "";
+
+        /*
+            concatenate the contents of the listed dom elements up to
+            the limit of the description length
+        */
+        $page_parts = array("/rss/channel/description",
+            "/rss/channel/category", "/rss/channel/lastBuildDate",
+            "/rss/channel/copyright");
+        foreach($page_parts as $part) {
+            $doc_nodes = $xpath->evaluate($part);
+            foreach($doc_nodes as $node) {
+                $description .= " ".$node->textContent;
+                if(strlen($description) > self::MAX_DESCRIPTION_LEN) { break 2;}
+            }
+        }
+        $description = mb_ereg_replace("(\s)+", " ", $description);
+
+        return $description;
+    }
+
+    /**
+     * Returns up to MAX_LINKS_PER_PAGE many links from the supplied
+     * dom object where links have been canonicalized according to
+     * the supplied $site information.
+     *
+     * @param object $dom a document object with links on it
+     * @param string $site a string containing a url
+     *
+     * @return array links from the $dom object
+     */
+    static function links($dom, $site)
+    {
+        $sites = array();
+
+        $xpath = new DOMXPath($dom);
+
+        $link_nodes = array(
+            "/rss/channel" => array( "url" =>"link", "text" => "title"),
+            "/rss/channel/image" => array( "url" =>"url", "text" => "title"),
+            "/rss/channel/item" => array( "url" =>"link", "text" => "title"),
+        );
+
+        $i = 0;
+
+        foreach($link_nodes as $path => $url_text_pair) {
+            $nodes = $xpath->evaluate($path);
+            foreach($nodes as $node) {
+                $result = self::linkAndTexts($node,
+                    $url_text_pair['url'], $url_text_pair['text'], $site);
+                if($result != false) {
+                    list($url, $text) = $result;
+                    $sites[$url] = $text;
+                    $i++;
+                }
+                if($i >= MAX_LINKS_PER_PAGE) {
+                    break 2;
+                }
+            }
+
+        }
+
+        return $sites;
+    }
+
+    /**
+     * Returns a url text pair where the url comes from the link of
+     * the given item node and the text comes from the text data for that
+     * node. urls are canonicalized according to site.
+     *
+     * @param object $item_node the DOMNode to get a link and text from
+     * @param string $link_name name of link tag
+     * @param string $text_name name of text tag to associate with link
+     * @param string $site a string containing a url
+     *
+     * @return array a url,text pair
+     */
+    static function linkAndTexts($item_node, $link_name, $text_name, $site)
+    {
+        $url = NULL;
+        $text = "";
+        foreach($item_node->childNodes as $node) {
+            if($node->nodeName == $link_name) {
+                $url = UrlParser::canonicalLink(
+                    $node->textContent, $site);
+                if($url === NULL || $url === "" ||
+                    UrlParser::checkRecursiveUrl($url)) {
+                    return false;
+                }
+            }
+            if($node->nodeName == $text_name) {
+                $text = $node->textContent;
+            }
+        }
+        if($url === NULL) {
+            return false;
+        }
+        if($text == "") {
+            $text = "RSS Feed";
+        }
+        $text = mb_ereg_replace("(\s)+", " ", $text);
+        return array($url, $text);
+    }
+
+}
+
+?>
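
To make the flow concrete, a sketch of processing a small feed; the constant
MAX_LINKS_PER_PAGE ordinarily comes from configs/config.php, so defining it
here is purely for the illustration:

    define("MAX_LINKS_PER_PAGE", 50);
    $feed = "<rss><channel><title>Demo Feed</title>".
        "<link>http://www.example.com/</link>".
        "<item><title>A Post</title>".
        "<link>http://www.example.com/post</link></item>".
        "</channel></rss>";
    $summary = RssProcessor::process($feed, "http://www.example.com/rss");
    // $summary[RssProcessor::TITLE] == "Demo Feed"
    // $summary[RssProcessor::LINKS] maps each canonicalized url to the
    // title found beside it, e.g. "http://www.example.com/post" => "A Post"
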
diff --git a/lib/processors/sitemap_processor.php b/lib/processors/sitemap_processor.php
new file mode 100644
index 000000000..def712d60
--- /dev/null
+++ b/lib/processors/sitemap_processor.php
@@ -0,0 +1,150 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage processor
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Load base class, if needed.
+ */
+require_once BASE_DIR."/lib/processors/text_processor.php";
+/**
+ * Load so can parse urls
+ */
+require_once BASE_DIR."/lib/url_parser.php";
+
+/**
+ * Used to create crawl summary information
+ * for sitemap files
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage processor
+ */
+class SitemapProcessor extends TextProcessor
+{
+
+    /**
+     * Used to extract the title, description and links from
+     * a string consisting of sitemap data.
+     *
+     * @param string $page web-page contents
+     * @param string $url the url where the page contents came from,
+     *    used to canonicalize relative links
+     *
+     * @return array a summary of the contents of the page
+     *
+     */
+    public static function process($page, $url)
+    {
+        $summary = NULL;
+        if(is_string($page)) {
+            $dom = self::dom($page);
+
+            if($dom !== false) {
+                $summary[self::TITLE] = $url;
+                $summary[self::DESCRIPTION] = "Sitemap of ".$url;
+                $summary[self::LINKS] = self::links($dom, $url);
+
+                if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE])
+                    == 0 && count($summary[self::LINKS]) == 0) {
+                    //maybe not a sitemap? treat as text and still try to get urls
+                    $summary = parent::process($page, $url);
+                }
+            }
+        }
+
+        return $summary;
+
+    }
+
+
+
+    /**
+     * Return a document object based on a string containing the contents of
+     * a sitemap page
+     *
+     * @param string $page a web page
+     *
+     * @return object document object
+     */
+    static function dom($page)
+    {
+        $dom = new DOMDocument();
+
+        @$dom->loadXML($page);
+
+        return $dom;
+    }
+
+
+    /**
+     * Returns links from the supplied dom object of a sitemap
+     * where links have been canonicalized according to
+     * the supplied $site information. We allow more links from a sitemap
+     * than from other kinds of documents. For now we are ignoring weighting
+     * info
+     *
+     * @param object $dom a document object with links on it
+     * @param string $site a string containing a url
+     *
+     * @return array links from the $dom object
+     */
+    static function links($dom, $site)
+    {
+        $sites = array();
+
+        $xpath = new DOMXPath($dom);
+        $xpath->registerNamespace('s',
+            "http://www.sitemaps.org/schemas/sitemap/0.9");
+        $paths = array(
+            "/s:urlset/s:url/s:loc",
+            "/s:sitemapindex/s:sitemap/s:loc"
+        );
+
+        foreach($paths as $path) {
+            $nodes = $xpath->evaluate($path);
+            foreach($nodes as $node) {
+                $url = UrlParser::canonicalLink(
+                    $node->textContent, $site);
+                if($url === NULL || $url === "" ||
+                    UrlParser::checkRecursiveUrl($url)) {
+                    continue;
+                }
+                $sites[$url] = "From sitemap of ".$site;
+            }
+
+        }
+        return $sites;
+    }
+}
+?>
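
A sketch of the namespace handling that links() depends on; the urls are
invented:

    $map = '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'.
        '<url><loc>http://www.example.com/a.html</loc></url>'.
        '<url><loc>http://www.example.com/b.html</loc></url></urlset>';
    $summary = SitemapProcessor::process($map,
        "http://www.example.com/sitemap.xml");
    // $summary[SitemapProcessor::LINKS] holds both loc urls, each mapped
    // to the text "From sitemap of http://www.example.com/sitemap.xml"

Because sitemaps declare the sitemaps.org namespace, the xpath queries only
match after registerNamespace binds the s: prefix; an unprefixed query such
as /urlset/url/loc would find nothing.
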
diff --git a/lib/processors/svg_processor.php b/lib/processors/svg_processor.php
new file mode 100644
index 000000000..b54bc6be2
--- /dev/null
+++ b/lib/processors/svg_processor.php
@@ -0,0 +1,217 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage processor
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Load base class, if needed.
+ */
+require_once BASE_DIR."/lib/processors/text_processor.php";
+
+/**
+ * Load so can parse urls
+ */
+require_once BASE_DIR."/lib/url_parser.php";
+
+/**
+ * Used for convertPixels
+ */
+require_once BASE_DIR."/lib/utility.php";
+
+
+/**
+ * Used to create crawl summary information
+ * for SVG files. This class is a little bit
+ * weird in that it generates thumbs like the
+ * image processor classes, but when it gives
+ * up on the data it falls back to text
+ * processor handling.
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage processor
+ */
+class SvgProcessor extends TextProcessor
+{
+    const MAX_DESCRIPTION_LEN = 2000;
+    const MAX_THUMB_LEN = 5000;
+
+
+    /**
+     * Used to extract the title, description and links from
+     * a string consisting of svg image data. If the image is small
+     * enough, an attempt is made to generate a thumbnail
+     *
+     * @param string $page web-page contents
+     * @param string $url the url where the page contents came from,
+     *    used to canonicalize relative links
+     *
+     * @return array a summary of the contents of the page
+     *
+     */
+    public static function process($page, $url)
+    {
+        $summary = NULL;
+        if(is_string($page)) {
+            self::closeDanglingTags($page);
+            $dom = self::dom($page);
+
+            if($dom !== false && isset($dom->documentElement)) {
+                $summary[self::TITLE] = "";
+                $summary[self::DESCRIPTION] = self::description($dom);
+                $summary[self::LINKS] = array();
+                $summary[self::PAGE] =
+                    "<html><body><div><img src='data:image/svg+xml;base64," .
+                    base64_encode($page)."' alt='".$summary[self::DESCRIPTION].
+                    "' /></div></body></html>";
+                if(strlen($page) < self::MAX_THUMB_LEN) {
+                    $thumb_string = self::createThumb($dom);
+                    $summary[self::THUMB] = 'data:image/svg+xml;base64,'.
+                        base64_encode($thumb_string);
+                }
+            } else {
+                $summary = parent::process($page, $url);
+            }
+        }
+        return $summary;
+    }
+
+    /**
+     * Used to create an svg thumbnail from a dom object
+     *
+     * @param object $dom a dom svg image object
+     * @return string the svg source of the resized thumbnail
+     */
+    static function createThumb($dom)
+    {
+        $svg = $dom->documentElement;
+        if($svg->hasAttribute("width")) {
+            $width = $svg->getAttribute("width");
+        } else {
+            $width = 600;
+        }
+        $width = convertPixels($width);
+        if($svg->hasAttribute("height")) {
+            $height = $svg->getAttribute("height");
+        } else {
+            $height = 600;
+        }
+        $height = convertPixels($height);
+        $svg->setAttributeNS("", "width", "150px");
+        $svg->setAttributeNS("", "height", "150px");
+        if(!$svg->hasAttribute("viewBox")) {
+            $svg->setAttributeNS("", "viewBox", "0 0 $width $height");
+        }
+
+        return $dom->saveXML();
+
+    }
+
+
+    /**
+     * Return a document object based on a string containing the contents of
+     * an SVG page
+     *
+     * @param string $page a web page
+     *
+     * @return object document object
+     */
+    static function dom($page)
+    {
+        $dom = new DOMDocument();
+
+        @$dom->loadXML($page);
+
+        return $dom;
+    }
+
+
+    /**
+     * Returns a title for an svg document based on its desc tags
+     *
+     * @param object $dom a document object to extract a title from.
+     * @return string a title of the page
+     *
+     */
+    static function title($dom)
+    {
+        $sites = array();
+
+        $xpath = new DOMXPath($dom);
+        $titles = $xpath->evaluate("/svg//desc");
+
+        $title = "";
+
+        foreach($titles as $pre_title) {
+            $title .= $pre_title->textContent;
+        }
+
+        return $title;
+    }
+
+    /**
+     * Returns descriptive text concerning a svg page based on its document
+     * object
+     *
+     * @param object $dom a document object to extract a description from.
+     * @return string a description of the page
+     */
+    static function description($dom) {
+        $sites = array();
+
+        $xpath = new DOMXPath($dom);
+
+        $description = "";
+
+        /*
+            concatenate the contents of the listed dom elements up to
+            the limit of the description length
+        */
+        $page_parts = array("/svg//desc",
+            "/svg//text");
+        foreach($page_parts as $part) {
+            $doc_nodes = $xpath->evaluate($part);
+            foreach($doc_nodes as $node) {
+                $description .= " ".$node->textContent;
+                if(strlen($description) > self::MAX_DESCRIPTION_LEN) { break 2;}
+            }
+        }
+        $description = mb_ereg_replace("(\s)+", " ", $description);
+
+        return $description;
+    }
+
+}
+
+?>
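
A sketch of what createThumb does to a document; the markup is invented, and
convertPixels (from lib/utility.php) is assumed to reduce values like "600"
or "600px" to plain pixel counts:

    $dom = new DOMDocument();
    $dom->loadXML("<svg width='600' height='300'></svg>");
    echo SvgProcessor::createThumb($dom);
    // the svg now has width and height of 150px and a viewBox of
    // "0 0 600 300", so the original drawing scales into the thumb box

Unlike the bmp processor, no rasterizing happens; the thumb stays vector
data, which is why it is stored as an image/svg+xml data url rather than
image/jpeg.
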
diff --git a/lib/processors/xml_processor.php b/lib/processors/xml_processor.php
new file mode 100644
index 000000000..79cd166ec
--- /dev/null
+++ b/lib/processors/xml_processor.php
@@ -0,0 +1,150 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage processor
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Load base class, if needed.
+ */
+require_once BASE_DIR."/lib/processors/text_processor.php";
+
+/**
+ * If XML turns out to be RSS ...
+ */
+require_once BASE_DIR."/lib/processors/rss_processor.php";
+
+/**
+ * If XML turns out to be XHTML ...
+ */
+require_once BASE_DIR."/lib/processors/html_processor.php";
+
+/**
+ * If XML turns out to be a sitemap ...
+ */
+require_once BASE_DIR."/lib/processors/sitemap_processor.php";
+
+/**
+ * If XML turns out to be SVG ...
+ */
+require_once BASE_DIR."/lib/processors/svg_processor.php";
+
+
+/**
+ * Load so can parse urls
+ */
+require_once BASE_DIR."/lib/url_parser.php";
+
+/**
+ * Used to create crawl summary information
+ * for XML files (those served as text/xml)
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage processor
+ */
+class XmlProcessor extends TextProcessor
+{
+    const MAX_DESCRIPTION_LEN = 2000;
+
+
+    /**
+     * Used to extract the title, description and links from
+     * a string consisting of xml data. The method looks at the root
+     * element and delegates to the processor for the more specific
+     * document type when it recognizes one.
+     *
+     * @param string $page web-page contents
+     * @param string $url the url where the page contents came from,
+     *    used to canonicalize relative links
+     *
+     * @return array a summary of the contents of the page
+     *
+     */
+    public static function process($page, $url)
+    {
+        $summary = NULL;
+        if(is_string($page)) {
+            self::closeDanglingTags($page);
+
+            $dom = self::dom($page);
+
+            $root_name = isset($dom->documentElement->nodeName) ?
+                $dom->documentElement->nodeName : "";
+            unset($dom);
+
+            switch ($root_name)
+            {
+                case "rss":
+                    $summary = RssProcessor::process($page, $url);
+                break;
+                case "html":
+                    $summary = HtmlProcessor::process($page, $url);
+                break;
+                case "sitemapindex":
+                case "urlset":
+                    $summary = SitemapProcessor::process($page, $url);
+                break;
+                case "svg":
+                    $summary = SvgProcessor::process($page, $url);
+                break;
+                default:
+                    $summary = parent::process($page, $url);
+            }
+        }
+
+        return $summary;
+
+    }
+
+
+
+    /**
+     * Return a document object based on a string containing the contents of
+     * an XML page
+     *
+     * @param string $page a web page
+     *
+     * @return object document object
+     */
+    static function dom($page)
+    {
+        $dom = new DOMDocument();
+
+        @$dom->loadXML($page);
+
+        return $dom;
+    }
+
+}
+
+?>
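
Finally, a sketch of the dispatch XmlProcessor performs; the url is
invented:

    $page = "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>".
        "<url><loc>http://www.example.com/</loc></url></urlset>";
    $summary = XmlProcessor::process($page, "http://www.example.com/s.xml");
    // the root element is urlset, so SitemapProcessor::process did the
    // work; an <rss> root would route to RssProcessor, <svg> to
    // SvgProcessor, <html> to HtmlProcessor, anything else to the
    // TextProcessor parent

The dom built here is only consulted for the root element's name; the raw
$page string is what gets handed to the chosen processor.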