diff --git a/bin/queue_server.php b/bin/queue_server.php index 3f6e7f497..44d897579 100755 --- a/bin/queue_server.php +++ b/bin/queue_server.php @@ -574,7 +574,7 @@ class QueueServer implements CrawlConstants crawlLog("... less than max age\n"); } - crawlLog("Checking for Robot.txt files to process..."); + crawlLog("Checking for robots.txt files to process..."); $robot_dir = CRAWL_DIR."/schedules/". self::robot_data_base_name.$this->crawl_time; @@ -1002,7 +1002,7 @@ class QueueServer implements CrawlConstants } //if delay else } // if containsGotRobotTxt - // handle robot.txt urls + // handle robots.txt urls $i++; diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php index 0b759e982..aeefd4967 100644 --- a/lib/index_archive_bundle.php +++ b/lib/index_archive_bundle.php @@ -276,7 +276,8 @@ class WordIterator implements IndexingConstants, CrawlConstants $this->index->num_partitions_index); if($info_block == NULL) { - $this->info_block = $this->index->getPhraseIndexInfo($this->word_key); + $this->info_block = + $this->index->getPhraseIndexInfo($this->word_key); } else { $this->info_block = $info_block; } @@ -375,6 +376,10 @@ class WordIterator implements IndexingConstants, CrawlConstants */ public function currentDocsWithWord($restrict_phrases = NULL) { + if($this->num_generations <= + $this->info_block['CURRENT_GENERATION_INDEX']) { + return -1; + } $generation = $this->info_block['GENERATIONS'][ $this->info_block['CURRENT_GENERATION_INDEX']]; @@ -958,7 +963,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants if($phrase_info == NULL || (isset($phrase_info[self::PARTIAL_COUNT]) && $phrase_info[self::PARTIAL_COUNT] < $limit + $num)) { - $this->addPhraseIndex( $word_key, $restrict_phrases, $phrase_key, $limit + $num); } diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index ec7930e8e..bc27150f3 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -52,6 +52,8 
@@ require_once BASE_DIR."/lib/url_parser.php"; */ class HtmlProcessor extends TextProcessor { + const MAX_DESCRIPTION_LEN = 3000; + /** * Used to extract the title, description and links from @@ -136,7 +138,7 @@ class HtmlProcessor extends TextProcessor $sites = array(); $xpath = new DOMXPath($dom); - $titles = $xpath->evaluate("/html/head//title"); + $titles = $xpath->evaluate("/html//title"); $title = ""; @@ -158,7 +160,8 @@ class HtmlProcessor extends TextProcessor $sites = array(); $xpath = new DOMXPath($dom); - $metas = $xpath->evaluate("/html/head//meta"); + + $metas = $xpath->evaluate("/html//meta"); $description = ""; @@ -169,14 +172,22 @@ class HtmlProcessor extends TextProcessor } } - //concatenate the contents of all the h1, h2 tags in the document - $headings = $xpath->evaluate( - "/html/body//h1|/html/body//h2|/html/body//h3|/html/body//p[1]"); - - foreach($headings as $h) { - $description .= " ".$h->textContent; + /* + concatenate the contents of the additional dom elements up to + the limit of description length + */ + $page_parts = array("/html//h1", "/html//h2", "/html//h3", + "/html//h4", "/html//h5", "/html//h6", "/html//p[1]", + "/html//div[1]", "/html//p[2]", "/html//div[2]", + "/html//td"); + foreach($page_parts as $part) { + $doc_nodes = $xpath->evaluate($part); + foreach($doc_nodes as $node) { + $description .= " ".$node->textContent; + if(strlen($description) > self::MAX_DESCRIPTION_LEN) { break 2;} + } } - $description = mb_ereg_replace("(\s)+", " ", $description); + $description = mb_ereg_replace("(\s)+", " ", $description); return $description; } diff --git a/lib/processors/image_processor.php b/lib/processors/image_processor.php index 90ca39a57..009e6917b 100755 --- a/lib/processors/image_processor.php +++ b/lib/processors/image_processor.php @@ -73,7 +73,7 @@ abstract class ImageProcessor implements CrawlConstants static function createThumb($image) { $thumb = imagecreatetruecolor(50, 50); - if( isset($image) ) { + if( isset($image) 
&& $image != false ) { $size_x = imagesx($image); $size_y = imagesy($image); diff --git a/lib/processors/rtf_processor.php b/lib/processors/rtf_processor.php index 3430d588b..0e707c5be 100755 --- a/lib/processors/rtf_processor.php +++ b/lib/processors/rtf_processor.php @@ -54,9 +54,7 @@ class RtfProcessor extends TextProcessor * * @param string $page rtf string of a document * @param string $url location the document came from, not used by - * RTFProcessor at this point. Some of its subclasses override - * this method and use url to produce complete links for - * relative links within a document + * RTFProcessor at this point. * @return array a summary of (title, description,links, and content) of * the information in $page */ diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php index ad518840c..8e768c705 100755 --- a/lib/processors/text_processor.php +++ b/lib/processors/text_processor.php @@ -66,7 +66,7 @@ class TextProcessor implements CrawlConstants if(is_string($page)) { $summary[self::TITLE] = ""; $summary[self::DESCRIPTION] = mb_substr($page, 0, 400); - $summary[self::LINKS] = array(); + $summary[self::LINKS] = self::extractHttpHttpsUrls($page); $summary[self::PAGE] = "<html><body><pre>$page</pre></body></html>"; } return $summary; } @@ -105,6 +105,27 @@ class TextProcessor implements CrawlConstants } + /** + * Tries to extract http or https links from a string of text. + * Does this by a very approximate regular expression. 
+ * + * @param string $page text string of a document + * @return array a set of http or https links that were extracted from + * the document + */ + static function extractHttpHttpsUrls($page) + { + $pattern = + '@((http|https)://([^ \t\r\n\v\f\'\"\;\,\<\>\[\]\{\}\(\)])*)@i'; + $sites = array(); + preg_match_all($pattern, $page, $matches); + foreach($matches[0] as $url) { + if(!isset($sites[$url])) { + $sites[$url] = strip_tags($url); + } + } + return $sites; + } } ?> diff --git a/models/phrase_model.php b/models/phrase_model.php index 7fdc97161..eb0d38795 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -155,7 +155,7 @@ class PhraseModel extends Model $hashes[] = $tmp; } $hashes = array_merge($hashes, $hash_quoteds); - $restrict_phrases = array_merge($words, $quoteds); + $restrict_phrases = array_merge($query_words, $quoteds); $hashes = array_unique($hashes);