diff --git a/bin/queue_server.php b/bin/queue_server.php index 3f6e7f497..44d897579 100755 --- a/bin/queue_server.php +++ b/bin/queue_server.php @@ -574,7 +574,7 @@ class QueueServer implements CrawlConstants crawlLog("... less than max age\n"); } - crawlLog("Checking for Robot.txt files to process..."); + crawlLog("Checking for robots.txt files to process..."); $robot_dir = CRAWL_DIR."/schedules/". self::robot_data_base_name.$this->crawl_time; @@ -1002,7 +1002,7 @@ class QueueServer implements CrawlConstants } //if delay else } // if containsGotRobotTxt - // handle robot.txt urls + // handle robots.txt urls $i++; diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php index 0b759e982..aeefd4967 100644 --- a/lib/index_archive_bundle.php +++ b/lib/index_archive_bundle.php @@ -276,7 +276,8 @@ class WordIterator implements IndexingConstants, CrawlConstants $this->index->num_partitions_index); if($info_block == NULL) { - $this->info_block = $this->index->getPhraseIndexInfo($this->word_key); + $this->info_block = + $this->index->getPhraseIndexInfo($this->word_key); } else { $this->info_block = $info_block; } @@ -375,6 +376,10 @@ class WordIterator implements IndexingConstants, CrawlConstants */ public function currentDocsWithWord($restrict_phrases = NULL) { + if($this->num_generations <= + $this->info_block['CURRENT_GENERATION_INDEX']) { + return -1; + } $generation = $this->info_block['GENERATIONS'][ $this->info_block['CURRENT_GENERATION_INDEX']]; @@ -958,7 +963,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants if($phrase_info == NULL || (isset($phrase_info[self::PARTIAL_COUNT]) && $phrase_info[self::PARTIAL_COUNT] < $limit + $num)) { - $this->addPhraseIndex( $word_key, $restrict_phrases, $phrase_key, $limit + $num); } diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index ec7930e8e..bc27150f3 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -52,6 +52,8 
@@ require_once BASE_DIR."/lib/url_parser.php"; */ class HtmlProcessor extends TextProcessor { + const MAX_DESCRIPTION_LEN = 3000; + /** * Used to extract the title, description and links from @@ -136,7 +138,7 @@ class HtmlProcessor extends TextProcessor $sites = array(); $xpath = new DOMXPath($dom); - $titles = $xpath->evaluate("/html/head//title"); + $titles = $xpath->evaluate("/html//title"); $title = ""; @@ -158,7 +160,8 @@ class HtmlProcessor extends TextProcessor $sites = array(); $xpath = new DOMXPath($dom); - $metas = $xpath->evaluate("/html/head//meta"); + + $metas = $xpath->evaluate("/html//meta"); $description = ""; @@ -169,14 +172,22 @@ class HtmlProcessor extends TextProcessor } } - //concatenate the contents of all the h1, h2 tags in the document - $headings = $xpath->evaluate( - "/html/body//h1|/html/body//h2|/html/body//h3|/html/body//p[1]"); - - foreach($headings as $h) { - $description .= " ".$h->textContent; + /* + concatenate the contents of the additional dom elements up to + the limit of description length + */ + $page_parts = array("/html//h1", "/html//h2", "/html//h3", + "/html//h4", "/html//h5", "/html//h6", "/html//p[1]", + "/html//div[1]", "/html//p[2]", "/html//div[2]", + "/html//td"); + foreach($page_parts as $part) { + $doc_nodes = $xpath->evaluate($part); + foreach($doc_nodes as $node) { + $description .= " ".$node->textContent; + if(strlen($description) > self::MAX_DESCRIPTION_LEN) { break 2;} + } } - $description = mb_ereg_replace("(\s)+", " ", $description); + $description = mb_ereg_replace("(\s)+", " ", $description); return $description; } diff --git a/lib/processors/image_processor.php b/lib/processors/image_processor.php index 90ca39a57..009e6917b 100755 --- a/lib/processors/image_processor.php +++ b/lib/processors/image_processor.php @@ -73,7 +73,7 @@ abstract class ImageProcessor implements CrawlConstants static function createThumb($image) { $thumb = imagecreatetruecolor(50, 50); - if( isset($image) ) { + if( isset($image) 
&& $image != false ) { $size_x = imagesx($image); $size_y = imagesy($image); diff --git a/lib/processors/rtf_processor.php b/lib/processors/rtf_processor.php index 3430d588b..0e707c5be 100755 --- a/lib/processors/rtf_processor.php +++ b/lib/processors/rtf_processor.php @@ -54,9 +54,7 @@ class RtfProcessor extends TextProcessor * * @param string $page rtf string of a document * @param string $url location the document came from, not used by - * RTFProcessor at this point. Some of its subclasses override - * this method and use url to produce complete links for - * relative links within a document + * RTFProcessor at this point. * @return array a summary of (title, description,links, and content) of * the information in $page */ diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php index ad518840c..8e768c705 100755 --- a/lib/processors/text_processor.php +++ b/lib/processors/text_processor.php @@ -66,7 +66,7 @@ class TextProcessor implements CrawlConstants if(is_string($page)) { $summary[self::TITLE] = ""; $summary[self::DESCRIPTION] = mb_substr($page, 0, 400); - $summary[self::LINKS] = array(); + $summary[self::LINKS] = self::extractHttpHttpsUrls($page); $summary[self::PAGE] = "<html><body><pre>$page</pre></body></html>"; } return $summary; } @@ -105,6 +105,27 @@ class TextProcessor implements CrawlConstants } + /** + * Tries to extract http or https links from a string of text. + * Does this by a very approximate regular expression. 
+ * + * @param string $page text string of a document + * @return array a set of http or https links that were extracted from + * the document + */ + static function extractHttpHttpsUrls($page) + { + $pattern = + '@((http|https)://([^ \t\r\n\v\f\'\"\;\,\<\>\[\]\{\}\(\)])*)@i'; + $sites = array(); + preg_match_all($pattern, $page, $matches); + foreach($matches[0] as $url) { + if(!isset($sites[$url])) { + $sites[$url] = strip_tags($url); + } + } + return $sites; + } } ?> diff --git a/models/phrase_model.php b/models/phrase_model.php index 7fdc97161..eb0d38795 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -155,7 +155,7 @@ class PhraseModel extends Model $hashes[] = $tmp; } $hashes = array_merge($hashes, $hash_quoteds); - $restrict_phrases = array_merge($words, $quoteds); + $restrict_phrases = array_merge($query_words, $quoteds); $hashes = array_unique($hashes);