Improves handling of cache highlighting; HTML that cannot be parsed into a DOM is now processed by the text processor as a backup, a=chris

Chris Pollett [2010-09-18 21:Sep:th]

Improves handling of cache highlighting; HTML that cannot be parsed into a DOM is now processed by the text processor as a backup, a=chris

Filename
bin/fetcher.php
controllers/search_controller.php
lib/index_shard.php
lib/processors/html_processor.php
lib/processors/text_processor.php
lib/string_array.php
models/model.php
models/phrase_model.php
views/search_view.php

diff --git a/bin/fetcher.php b/bin/fetcher.php
index ffc8a3140..75a2c2624 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -596,7 +596,7 @@ class Fetcher implements CrawlConstants
                 $not_downloaded[] = $site;
             } else {
                 $duplicates[] = $site[self::URL];
-                echo "Deduplicated:".$site[self::URL]."\n";
+                crawlLog("Deduplicated:".$site[self::URL]);
             }

         }
@@ -1064,10 +1064,12 @@ class Fetcher implements CrawlConstants
             $description_length = $info[self::DESCRIPTION_LENGTH];
             $link_length = $info[self::LINK_LENGTH];

-            $title_ratio = $title_length/$average_title_length;
-            $description_ratio =
-                $description_length/$average_description_length;
-            $link_ratio = $link_length/$average_total_link_text_length;
+            $title_ratio = ($average_title_length > 0) ?
+                $title_length/$average_title_length : 0;
+            $description_ratio = ($average_description_length > 0) ?
+                $description_length/$average_description_length :0;
+            $link_ratio = ($average_total_link_text_length > 0) ?
+                $link_length/$average_total_link_text_length : 0;

             if(isset($info[self::TITLE_WORDS])) {
                 foreach($info[self::TITLE_WORDS]
diff --git a/controllers/search_controller.php b/controllers/search_controller.php
index 0328bf634..8b027f373 100755
--- a/controllers/search_controller.php
+++ b/controllers/search_controller.php
@@ -376,11 +376,19 @@ class SearchController extends Controller implements CrawlConstants

         $dom = new DOMDocument();

-        @$dom->loadHTML($cache_file);
+        $did_dom = @$dom->loadHTML($cache_file);
+
         $xpath = new DOMXPath($dom);


         $body =  $dom->getElementsByTagName('body')->item(0);
+        if($body == false) {
+            $cache_file = "<html><head><title>Yioop! Cache</title></head>".
+                "<body>".htmlentities($cache_file)."</body></html>";
+            $dom = new DOMDocument();
+            @$dom->loadHTML($cache_file);
+            $body =  $dom->getElementsByTagName('body')->item(0);
+        }
         $first_child = $body->firstChild;

         $divNode = $dom->createElement('div');
@@ -402,13 +410,19 @@ class SearchController extends Controller implements CrawlConstants

         $i = 0;
         foreach($words as $word) {
-            if(strlen($word) > 0) {
-            $match = crawlHash($word).$word;
-            $newDoc = preg_replace("/$match/i",
-                '<span style="background-color:'.
-                $colors[$i].'">$0</span>', $newDoc);
-            $i = ($i + 1) % $color_count;
-            $newDoc = preg_replace("/".crawlHash($word)."/", "", $newDoc);
+            //only mark string of length at least 2
+            if(strlen($word) > 1) {
+                $mark_prefix = crawlHash($word);
+                if(stristr($mark_prefix, $word) !== false) {
+                    $mark_prefix = preg_replace(
+                    "/$word/i", '', $mark_prefix);
+                }
+                $match = $mark_prefix.$word;
+                $newDoc = preg_replace("/$match/i",
+                    '<span style="background-color:'.
+                    $colors[$i].'">$0</span>', $newDoc);
+                $i = ($i + 1) % $color_count;
+                $newDoc = preg_replace("/".$mark_prefix."/", "", $newDoc);
             }
         }

@@ -442,9 +456,15 @@ class SearchController extends Controller implements CrawlConstants
                 $text = $clone->textContent;

                 foreach($words as $word) {
-                    if(strlen($word) > 0) {
+                    //only mark string of length at least 2
+                    if(strlen($word) > 1) {
+                        $mark_prefix = crawlHash($word);
+                        if(stristr($mark_prefix, $word) !== false) {
+                            $mark_prefix = preg_replace(
+                            "/$word/i", '', $mark_prefix);
+                        }
                         $text = preg_replace(
-                            "/$word/i", crawlHash($word).'$0', $text);
+                            "/$word/i", $mark_prefix.'$0', $text);
                     }
                 }

diff --git a/lib/index_shard.php b/lib/index_shard.php
index 03447bee1..e903bc5b1 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -43,4 +43,105 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 class IndexShard extends PersistentStructure implements Serializable
 {
+    //NOTE(review): Serializable needs serialize()/unserialize(); neither
+    //is defined in this hunk -- confirm they are supplied elsewhere
+
+    /**
+     * Ids of the documents added so far, in insertion order
+     * @var array
+     */
+    var $doc_ids = array();
+    /**
+     * Maps word_id => string of packed 4-byte postings appended by
+     * addDocumentWords
+     * @var array
+     */
+    var $word_docs = array();
+    /**
+     * Advances by 256 per added document so the low byte of a posting
+     * is free to hold the relevance
+     * @var int
+     */
+    var $count_doc256 = 0;
+
+    /**
+     * Does nothing; properties keep their declared defaults
+     */
+    function __construct()
+    {
+    }
+
+    /**
+     * Records a document and appends one posting per word it contains
+     *
+     * A posting is pack("N", count_doc256 + relevance): the document's
+     * position times 256, plus the relevance in the low byte.
+     *
+     * @param mixed $doc_id identifier stored for the document
+     * @param array $word_id_array word_id => relevance; only the low 8
+     *      bits of each relevance are kept
+     */
+    function addDocumentWords($doc_id, $word_id_array)
+    {
+        $this->doc_ids[] = $doc_id;
+
+        foreach($word_id_array as $word_id => $relevance) {
+            $relevance = $relevance & 255;
+            $store = pack("N", $this->count_doc256 + $relevance);
+            //start from "" so the append never hits an undefined index
+            if(!isset($this->word_docs[$word_id])) {
+                $this->word_docs[$word_id] = "";
+            }
+            $this->word_docs[$word_id] .= $store;
+        }
+
+        $this->count_doc256 += 256;
+    }
+
+    /**
+     * Fetches the packed postings of a word for a range of positions
+     *
+     * @param mixed $word_id key into $this->word_docs
+     * @param int $start position of the first posting wanted
+     * @param int $len number of postings wanted
+     * @return array always empty for now: the decode loop has no body
+     */
+    function getWordSlice($word_id, $start, $len)
+    {
+        $result = array();
+        if(isset($this->word_docs[$word_id])) {
+            //postings are 4 bytes wide, hence the shifts by 2
+            $docs_string = substr($this->word_docs[$word_id],
+                $start << 2, $len << 2);
+            //check if got at least one item
+            if($docs_string !== false &&
+                ($doc_len = strlen($docs_string)) > 3) {
+                for($i = 0; $i < $doc_len; $i += 4) {
+                    //TODO: unpack the posting at $i into $result
+                }
+            }
+        }
+
+        return $result;
+    }
+
+    /**
+     * Placeholder: currently does nothing with its argument
+     *
+     * @param object $index_shard presumably another IndexShard -- confirm
+     */
+    function appendIndexShard($index_shard)
+    {
+    }
+
+    /**
+     * Number of documents added, recovered by dropping the low byte
+     *
+     * @return int count_doc256 divided by 256
+     */
+    function docCount()
+    {
+        return ($this->count_doc256 >> 8);
+    }
+
 }
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index 4dbc73758..9d2846fe7 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -68,19 +68,24 @@ class HtmlProcessor extends TextProcessor
      */
     public static function process($page, $url)
     {
+        $summary = NULL;
         if(is_string($page)) {
             $dom = self::dom($page);

-            if(self::checkMetaRobots($dom)) {
+            if($dom !==false && self::checkMetaRobots($dom)) {
                 $summary[self::TITLE] = self::title($dom);
                 $summary[self::DESCRIPTION] = self::description($dom);
                 $summary[self::LINKS] = self::links($dom, $url);

-                return $summary;
+                if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE])
+                    == 0 && count($summary[self::LINKS]) == 0) {
+                    //maybe not html? treat as text, still try to get urls
+                    $summary = parent::process($page, $url);
+                }
             }
         }

-        return NULL;
+        return $summary;

     }

diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php
index c98580447..e9714da78 100755
--- a/lib/processors/text_processor.php
+++ b/lib/processors/text_processor.php
@@ -67,7 +67,8 @@ class TextProcessor implements CrawlConstants
             $summary[self::TITLE] = "";
             $summary[self::DESCRIPTION] = mb_substr($page, 0, 400);
             $summary[self::LINKS] = self::extractHttpHttpsUrls($page);
-            $summary[self::PAGE] = "<html><body><pre>$page</pre></body></html>";
+            $summary[self::PAGE] = "<html><body><pre>".
+                strip_tags($page)."</pre></body></html>";
         }
         return $summary;
     }
diff --git a/lib/string_array.php b/lib/string_array.php
index 831ea4dd8..6d830f865 100755
--- a/lib/string_array.php
+++ b/lib/string_array.php
@@ -41,7 +41,9 @@ require_once "persistent_structure.php";
 /**
  * Memory efficient implementation of persistent arrays
  *
- * The standard array ob
+ * The standard array objects in php and even spl have a large amount of
+ * overhead. The point of this class is to have the size as close to the
+ * optimal as possible
  *
  * @author Chris Pollett
  *
diff --git a/models/model.php b/models/model.php
index d82d5c31c..426d3b9c5 100755
--- a/models/model.php
+++ b/models/model.php
@@ -132,6 +132,10 @@ class Model implements CrawlConstants
                 $page[self::TITLE] =
                     substr(strip_tags($page[self::DESCRIPTION]), 0, $end_title).
                     $ellipsis;
+                //still no text revert to url
+                if(strlen($page[self::TITLE]) == 0) {
+                    $page[self::TITLE] = $page[self::URL];
+                }
             }


diff --git a/models/phrase_model.php b/models/phrase_model.php
index e6d2d6db0..b92584645 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -137,6 +137,14 @@ class PhraseModel extends Model

     }

+    /**
+     * Determines the offset into the summaries WebArchiveBundle of the
+     * provided url so that it can be retrieved. This relies on the
+     * fact that the info:url meta word has been stored.
+     *
+     * @param string $url what to lookup
+     * @return int offset into the web archive bundle
+     */
     function lookupSummaryOffset($url)
     {
         $index_archive_name = self::index_data_base_name . $this->index_name;
@@ -160,6 +168,15 @@ class PhraseModel extends Model
         return $summary_offset;
     }

+    /**
+     *  Parses from a string phrase representing a conjunctive query, a struct
+     *  consisting of the words keys searched for, the allowed and disallowed
+     *  phrases, the weight that should be put on these query results, and
+     *  which archive to use.
+     *
+     * @param string $phrase string to extract struct from
+     * @return array struct representing the conjunctive query
+     */
     function parseWordStructConjunctiveQuery($phrase)
     {
         $phrase = " ".$phrase;
diff --git a/views/search_view.php b/views/search_view.php
index 9be7cc805..71cadab4b 100755
--- a/views/search_view.php
+++ b/views/search_view.php
@@ -115,9 +115,10 @@ class SearchView extends View implements CrawlConstants
                 <div class='result'>
                 <h2>
                 <a href="<?php if($page[self::TYPE] != "link") {
-                    e($page[self::URL]);
-                    } else
-                    e(strip_tags($page[self::TITLE])); ?>" ><?php
+                        e($page[self::URL]);
+                    } else {
+                        e(strip_tags($page[self::TITLE]));
+                    } ?>" ><?php
                  if(isset($page[self::THUMB]) && $page[self::THUMB] != 'NULL') {
                     ?><img src="<?php e($page[self::THUMB]); ?>" alt="<?php
                         e($page[self::TITLE]); ?>"  /> <?php

ViewGit