Fixes a bug in page language detection, a=chris

Chris Pollett [2011-01-24 18:Jan:th]

Fixes a bug in page language detection, a=chris

Filename
bin/fetcher.php
lib/fetch_url.php
lib/index_bundle_iterators/group_iterator.php
lib/index_bundle_iterators/intersect_iterator.php
lib/processors/html_processor.php
lib/processors/rss_processor.php
lib/processors/text_processor.php

diff --git a/bin/fetcher.php b/bin/fetcher.php
index 362f65fbd..22cd013bc 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -1255,7 +1255,7 @@ class Fetcher implements CrawlConstants
                     $this->found_sites[self::SEEN_URLS][] = $summary;

                     $link_text =
-                        mb_ereg_replace("[[:punct:]]", " ", $link_text);
+                        mb_ereg_replace(PUNCT, " ", $link_text);
                     $link_word_counts =
                         PhraseParser::extractPhrasesAndCount($link_text);
                     $link_shard->addDocumentWords($link_keys,
@@ -1282,7 +1282,6 @@ class Fetcher implements CrawlConstants
         crawlLog("  Build mini inverted index time ".
             (changeInMicrotime($start_time)));
     }
-}

     /**
      * Calculates the meta words to be associated with a given downloaded
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index e53fda044..e2772275b 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -254,7 +254,7 @@ class FetchUrl implements CrawlConstants
             $line = trim($line);
             if(stristr($line, 'Server:')) {
                 $server_parts = explode("Server:", $line);
-                $server_name_parts = explode("/", $server_parts[1]);
+                $server_name_parts = @explode("/", $server_parts[1]);
                 $site[CrawlConstants::SERVER] = @trim($server_name_parts[0]);
                 if(isset($server_name_parts[1])) {
                     $version_parts = explode("(", $server_name_parts[1]);
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index 8f2669f25..fce41deab 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -202,7 +202,6 @@ class GroupIterator extends IndexBundleIterator
         $done = false;
         do {
             $new_pages = $this->index_bundle_iterator->currentDocsWithWord();
-
             if(!is_array($new_pages)) {
                 $done = true;
                 if(count($pages) == 0) {
@@ -218,6 +217,7 @@ class GroupIterator extends IndexBundleIterator
                 $done = true;
             }
         } while(!$done);
+
         return $pages;
     }

diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php
index d68f5d151..9e2a63193 100644
--- a/lib/index_bundle_iterators/intersect_iterator.php
+++ b/lib/index_bundle_iterators/intersect_iterator.php
@@ -96,9 +96,9 @@ class IntersectIterator extends IndexBundleIterator
         $this->results_per_block = 1;

         /*
-             the most results we can return is the size of the least num_docs
-             of what we are iterating over. We are also setting up here
-             that we return at most one posting at a time from each
+             We take an initial guess of the num_docs we return as the sum
+             of the num_docs of the underlying iterators. We are also setting
+             up here that we return at most one posting at a time from each
              iterator
         */
         for($i = 0; $i < $this->num_iterators; $i++) {
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index 73f45c9c4..65e444193 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -71,10 +71,11 @@ class HtmlProcessor extends TextProcessor
         $summary = NULL;
         if(is_string($page)) {
             $dom = self::dom($page);
-            if($dom !==false && self::checkMetaRobots($dom)) {
-                $summary[self::LANG] = self::lang($dom);
+            if($dom !== false && self::checkMetaRobots($dom)) {
                 $summary[self::TITLE] = self::title($dom);
-                $summary[self::DESCRIPTION] = self::description($dom);
+                $summary[self::DESCRIPTION] = self::description($dom);
+                $summary[self::LANG] = self::lang($dom,
+                    $summary[self::DESCRIPTION]);
                 $summary[self::LINKS] = self::links($dom, $url);
                 $summary[self::PAGE] = $page;
                 if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE])
@@ -133,19 +134,37 @@ class HtmlProcessor extends TextProcessor

     /**
      *  Determines the language of the html document by looking at the root
-     *  language attribute
+     *  language attribute. If that fails $sample_text is used to try to guess
+     *  the language
+     *
+     *  @param object $dom  a document object to check the language of
+     *  @param string $sample_text sample text to try to guess the language from
      *
-     *  @param object $dom - a document object to check the language of
-     *
      *  @return string language tag for guessed language
-
      */
-    static function lang($dom)
+    static function lang($dom, $sample_text = NULL)
     {
         $xpath = new DOMXPath($dom);
         $html = $xpath->evaluate("/html");
+        $lang = NULL;
         if(is_object($html->item(0))) {
             $lang = $html->item(0)->getAttribute('lang');
+        }
+        if($lang == NULL && $sample_text != NULL){
+            $words = mb_split("[[:space:]]|".PUNCT, $sample_text);
+            $num_words = count($words);
+            $ascii_count = 0;
+            foreach($words as $word) {
+                if(strlen($word) == mb_strlen($word)) {
+                    $ascii_count++;
+                }
+            }
+            // crude, but let's guess ASCII == English
+            if($ascii_count/$num_words > 0.9) {
+                $lang = 'en';
+            } else {
+                $lang = NULL;
+            }
         } else {
             $lang = NULL;
         }
diff --git a/lib/processors/rss_processor.php b/lib/processors/rss_processor.php
index 0ea3481b4..78f3d7622 100644
--- a/lib/processors/rss_processor.php
+++ b/lib/processors/rss_processor.php
@@ -73,11 +73,11 @@ class RssProcessor extends TextProcessor
             $dom = self::dom($page);

             if($dom !==false) {
-                $summary[self::LANG] = self::lang($dom);
                 $summary[self::TITLE] = self::title($dom);
                 $summary[self::DESCRIPTION] = self::description($dom);
+                $summary[self::LANG] = self::lang($dom,
+                    $summary[self::DESCRIPTION]);
                 $summary[self::LINKS] = self::links($dom, $url);
-
                 if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE])
                     == 0 && count($summary[self::LINKS]) == 0) {
                     //maybe not rss? treat as text still try to get urls
@@ -94,18 +94,35 @@ class RssProcessor extends TextProcessor
      *  language tag
      *
      *  @param object $dom - a document object to check the language of
-     *
+     *  @param string $sample_text sample text to try to guess the language from
+     *
      *  @return string language tag for guessed language
-
      */
-    static function lang($dom)
+    static function lang($dom, $sample_text = NULL)
     {
         $xpath = new DOMXPath($dom);
         $languages = $xpath->evaluate("/rss/channel/language");
         if($languages && is_object($languages)) {
             return $languages->item(0)->textContent;
+        } else if($sample_text != NULL){
+            $words = mb_split("[[:space:]]|".PUNCT, $sample_text);
+            $num_words = count($words);
+            $ascii_count = 0;
+            foreach($words as $word) {
+                if(strlen($word) == mb_strlen($word)) {
+                    $ascii_count++;
+                }
+            }
+            // crude, but let's guess ASCII == English
+            if($ascii_count/$num_words > 0.9) {
+                $lang = 'en';
+            } else {
+                $lang = NULL;
+            }
+        } else {
+            $lang = NULL;
         }
-        return NULL;
+        return $lang;
     }

     /**
diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php
index 8daa72be5..a835e0f91 100755
--- a/lib/processors/text_processor.php
+++ b/lib/processors/text_processor.php
@@ -66,6 +66,8 @@ class TextProcessor implements CrawlConstants
         if(is_string($page)) {
             $summary[self::TITLE] = "";
             $summary[self::DESCRIPTION] = mb_substr($page, 0, 400);
+            $summary[self::LANG] = self::calculateLang(
+                $summary[self::DESCRIPTION]);
             $summary[self::LINKS] = self::extractHttpHttpsUrls($page);
             $summary[self::PAGE] = "<html><body><pre>".
                 strip_tags($page)."</pre></body></html>";
@@ -73,6 +75,38 @@ class TextProcessor implements CrawlConstants
         return $summary;
     }

+
+    /**
+     *  Tries to determine the language of the document by looking at the
+     *  $sample_text provided
+     *
+     *  @param string $sample_text sample text to try to guess the language from
+     *
+     *  @return string language tag for guessed language
+     */
+    static function calculateLang($sample_text = NULL)
+    {
+        if($sample_text != NULL){
+            $words = mb_split("[[:space:]]|".PUNCT, $sample_text);
+            $num_words = count($words);
+            $ascii_count = 0;
+            foreach($words as $word) {
+                if(strlen($word) == mb_strlen($word)) {
+                    $ascii_count++;
+                }
+            }
+            // crude, but let's guess ASCII == English
+            if($ascii_count/$num_words > 0.9) {
+                $lang = 'en';
+            } else {
+                $lang = NULL;
+            }
+        } else {
+            $lang = NULL;
+        }
+        return $lang;
+    }
+
     /**
      * Gets the text between two tags in a document starting at the current
      * position.

ViewGit