improve language detection for French, Spanish, Italian, tries to make changeDocumentOffsets more robust to lookup failures, a=chris

Chris Pollett [2019-06-07 03:Jun:th]
improve language detection for French, Spanish, Italian, tries to make changeDocumentOffsets more robust to lookup failures, a=chris
Filename
src/controllers/components/CrawlComponent.php
src/executables/Fetcher.php
src/library/LocaleFunctions.php
src/library/PhraseParser.php
src/library/WebArchive.php
src/library/WebArchiveBundle.php
src/library/processors/HtmlProcessor.php
src/library/summarizers/ScrapeSummarizer.php
src/library/summarizers/Summarizer.php
src/models/LocaleModel.php
src/models/ParallelModel.php
src/models/PhraseModel.php
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index 25ff84f55..b1957e8fa 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -1634,13 +1634,11 @@ class CrawlComponent extends Component implements CrawlConstants
                     $site[self::URL]);
                 $phrase_string = $host_words . " .. ".$site[self::TITLE] .
                     " ..  ". $path_words . " .. ". $site[self::DESCRIPTION];
-                if (!empty($site[self::TITLE])) {
-                    $lang = L\guessLocaleFromString($site[self::TITLE], $lang);
-                } else {
-                    $lang = L\guessLocaleFromString($site[self::DESCRIPTION],
+                if (empty($site[self::LANG])) {
+                    $lang = L\guessLocaleFromString($phrase_string,
                         $lang);
+                        $site[self::LANG] = $lang;
                 }
-                $site[self::LANG] = $lang;
                 $word_lists = PhraseParser::extractPhrasesInLists(
                     $phrase_string, $lang);
                 $len = strlen($phrase_string);
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 6837951b0..6164c600e 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -2928,8 +2928,7 @@ class Fetcher implements CrawlConstants
                 }
                 if (empty($site[self::LANG])) {
                     $lang = L\guessLocaleFromString(
-                        mb_substr($site[self::DESCRIPTION], 0,
-                        C\AD_HOC_TITLE_LENGTH), C\DEFAULT_LOCALE);
+                        $site[self::DESCRIPTION]), C\DEFAULT_LOCALE);
                 } else {
                     $lang = $site[self::LANG];
                 }
diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php
index f977f3771..220423453 100755
--- a/src/library/LocaleFunctions.php
+++ b/src/library/LocaleFunctions.php
@@ -100,8 +100,7 @@ function guessLocale()
  */
 function guessLocaleFromString($phrase_string, $locale_tag = null)
 {
-    $original_phrase_string = mb_substr($phrase_string, 0,
-        C\AD_HOC_TITLE_LENGTH);
+    $original_phrase_string = $phrase_string;
     $locale_tag = ($locale_tag == null) ? getLocaleTag() : $locale_tag;
     $sub = C\PUNCT . "|[0-9]|\s";
     $phrase_string = preg_replace('/' . $sub . '/', "", $phrase_string);
@@ -153,7 +152,17 @@ function guessLocaleFromString($phrase_string, $locale_tag = null)
         }
     }
     if ($locale_tag == 'en-US') {
-        $locale_tag = checkQuery($original_phrase_string);
+        $len = strlen($original_phrase_string);
+        $locale_tag = 'en-US';
+        foreach (['en-US', 'fr-FR', 'es', 'it'] as $lang) {
+            $tokenizer = PhraseParser::getTokenizer($lang);
+            $test_len =
+                strlen($tokenizer->stopwordsRemover($original_phrase_string));
+            if ($test_len < $len) {
+                $len = $test_len;
+                $locale_tag = $lang;
+            }
+        }
     }
     return $locale_tag;
 }
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 2d9393010..f49eb242d 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -1371,7 +1371,7 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け
         static $unsafe_terms = [];
         /* took keywords from top level domains from some of theporndude list
          */
-        static $unsafe_host_regex = "/porn|xvideos|livejasmin|".
+        static $unsafe_url_regex = "/porn|xvideos|livejasmin|".
             "xhamster|bongacams|chaturbate|pussy|spankbang|".
             "xnxx|tnxx|beeg|daftsex|redtube|youjizz|vidz7|4tube|cumlouder|" .
             "tnaflix|xfantasy|vdiz24|luxuretv|perfectgirls|anysex|drtuber|" .
@@ -1399,8 +1399,8 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け
             "luscious\.net|hentai|freeones\.com|iafd|gayboystube|".
             "adam4adam|cams\.com|mrskin|adultwork|oglaf|streamate|".
             "nifty\.org|adultdvd|suicidegirls|ftvgirls|asstr|private\.com|".
-            "squirt\.org|fakku|faapy|fux\.com|txxx/i";
-        if (!empty($url) && preg_match($unsafe_host_regex, $url)) {
+            "squirt\.org|fakku|faapy|fux\.com|txxx|\Wnude\W/i";
+        if (!empty($url) && preg_match($unsafe_url_regex, $url)) {
             return 1;
         }
         if (count($word_lists) == 0) {
diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php
index 7248b7b9d..0d0ce8f10 100755
--- a/src/library/WebArchive.php
+++ b/src/library/WebArchive.php
@@ -118,7 +118,7 @@ class WebArchive
         } else {
             $this->iterator_pos = 0;
             $this->count = 0;
-            $fh =  fopen($this->filename, "w");
+            $fh = fopen($this->filename, "w");
             $this->writeInfoBlock($fh);
             fclose($fh);
         }
@@ -221,7 +221,7 @@ class WebArchive
      *     used to modify $data before it is written
      *     to the info block. For instance, we can add offset info to data.
      * @param bool $return_flag if true rather than adjust the offsets by
-     *     reference, create copy objects and adjust their offsets anf return
+     *     reference, create copy objects and adjust their offsets and return
      * @return mixed adjusted objects or void
      */
     public function addObjects($offset_field, &$objects,
diff --git a/src/library/WebArchiveBundle.php b/src/library/WebArchiveBundle.php
index 38686c96e..a70e5a7ff 100755
--- a/src/library/WebArchiveBundle.php
+++ b/src/library/WebArchiveBundle.php
@@ -232,7 +232,7 @@ class WebArchiveBundle
      *
      * @param int $index the number of the partition within this bundle to
      *     return
-     * @param bool $fast_construct should the constructor of the WebArchive
+     * @param bool $fast_construct tells the constructor of the WebArchive
      *     avoid reading in its info block.
      * @return object the WebArchive file which was requested
      */
@@ -246,14 +246,24 @@ class WebArchiveBundle
             $create_flag = false;
             $compressor = C\NS_LIB . "compressors\\" . $this->compressor;
             $compressor_obj = new $compressor();
-            $archive_name = $this->dir_name."/web_archive_".$index
+            $archive_name = $this->dir_name."/web_archive_" . $index
                 . $compressor_obj->fileExtension();
             if (!file_exists($archive_name)) {
                 $create_flag = true;
             }
+            $archive_name_exists = file_exists($archive_name);
             $this->partition[$index] =
                 new WebArchive($archive_name,
                     new $compressor(), $fast_construct);
+            if (!$archive_name_exists) {
+                /* always add a dummy record so an offset 0 of a real record
+                   can never be legit. This is just to be on the safe side
+                   if a changeDocumentOffsets in IndexShard happens not to work
+                 */
+                $dummy_pages = [["DUMMY"]];
+                $this->partition[$index]->addObjects("DUMMY_OFFSET",
+                    $dummy_pages);
+            }
             if ($create_flag && file_exists($archive_name)) {
                 chmod($archive_name, 0777);
             }
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index 0cc9f138d..549df7bf3 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -120,7 +120,7 @@ class HtmlProcessor extends TextProcessor
                     $summary[self::TITLE] = self::crudeTitle($dom_page);
                 }
                 $summary[self::LANG] = self::lang($dom,
-                    $summary[self::TITLE], $url);
+                    strip_tags($page), $url);
                 $description_dom = $dom;
                 if (!empty($scraper)) {
                     $scrape_results = ScraperManager::applyScraperRules(
diff --git a/src/library/summarizers/ScrapeSummarizer.php b/src/library/summarizers/ScrapeSummarizer.php
index 8d4f085a0..279146bfa 100644
--- a/src/library/summarizers/ScrapeSummarizer.php
+++ b/src/library/summarizers/ScrapeSummarizer.php
@@ -100,7 +100,7 @@ class ScrapeSummarizer extends Summarizer
             }
             if ($pos > 0 && !empty($block_ranks[$changeable_index]) &&
                 $max_score/$score_pos < $block_ranks[$changeable_index]) {
-                $fixed_summary_len += strlen($block_ranks[$changeable_index]);
+                $fixed_summary_len += strlen($blocks[$changeable_index]);
                 $changeable_index++;
                 if ($fixed_summary_len > $max_summary_len) {
                     break;
diff --git a/src/library/summarizers/Summarizer.php b/src/library/summarizers/Summarizer.php
index 1113f998f..65020eca5 100644
--- a/src/library/summarizers/Summarizer.php
+++ b/src/library/summarizers/Summarizer.php
@@ -351,14 +351,15 @@ class Summarizer
         $summary = "";
         $summary_length = 0;
         $top = self::numSentencesForSummary($sentence_scores, $sentences);
-        if ($top <= 1) {
+        if ($top < 1) {
             if (!empty($sentences[0])) {
-                $summary = $sentences[0];
+                $summary = substr($sentences[0], 0,
+                    PageProcessor::$max_description_len);
                 return [ltrim($summary), [1]];
             }
         }
         $summary_indices = array_keys(array_slice($sentence_scores, 0,
-            $top - 1, true));
+            $top, true));
         sort($summary_indices);
         $eos = ($lang == 'hi') ? "।" : "."; //default end of sentence symbol
         $summary_scores = [];
diff --git a/src/models/LocaleModel.php b/src/models/LocaleModel.php
index 0bcdb345e..581bf0889 100755
--- a/src/models/LocaleModel.php
+++ b/src/models/LocaleModel.php
@@ -146,7 +146,7 @@ class LocaleModel extends Model
             the statistics text file contains info used to calculate
             what fraction of strings have been translated
          */
-        $tag_prefix = C\LOCALE_DIR."/".str_replace("-", "_",
+        $tag_prefix = C\LOCALE_DIR . "/" . str_replace("-", "_",
             $locale['LOCALE_TAG']);
         if (!file_exists($tag_prefix)) {
             mkdir($tag_prefix); //create locale_dirs that are missing
@@ -386,8 +386,8 @@ class LocaleModel extends Model
     {
         $sql = "DELETE FROM LOCALE WHERE LOCALE_TAG = ?";
         $this->db->execute($sql, [$locale_tag]);
-        if (file_exists(C\LOCALE_DIR."/$locale_tag")) {
-            $this->db->unlinkRecursive(C\LOCALE_DIR."/$locale_tag", true);
+        if (file_exists(C\LOCALE_DIR . "/$locale_tag")) {
+            $this->db->unlinkRecursive(C\LOCALE_DIR . "/$locale_tag", true);
         }
     }
     /**
diff --git a/src/models/ParallelModel.php b/src/models/ParallelModel.php
index d69873529..1b97675d4 100755
--- a/src/models/ParallelModel.php
+++ b/src/models/ParallelModel.php
@@ -292,6 +292,16 @@ class ParallelModel extends Model
                         $index = IndexManager::getIndex($index_name);
                         if (is_integer($summary_offset) &&
                             is_integer($generation)) {
+                            if ($summary_offset == 0) {
+                                /*
+                                   we insert a dummy object at start of each
+                                   partition, so no legit lookup should have
+                                   offset 0. It still might happen, if a
+                                   changeDocumentOffsets failed, so we add
+                                   this check
+                                 */
+                                continue;
+                            }
                             $page = $index->getPage($summary_offset,
                                 $generation);
                         } else {
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 33210d6f7..c782e9809 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -895,7 +895,7 @@ class PhraseModel extends ParallelModel
         $matches = $matches[2];
         $result_phrase = preg_replace($pattern, "", $phrase);
         foreach ($matches as $match) {
-            $tag = L\guessLocaleFromString($match, $lang_tag, 10);
+            $tag = L\guessLocaleFromString($match, $lang_tag);
             $not_check = true;
             foreach ($not_contains as $not_contain) {
                 if (strstr($match, $not_contain)) {
@@ -940,7 +940,7 @@ class PhraseModel extends ParallelModel
         $matches = $matches[2];
         $result_phrase = preg_replace($pattern, " ", $phrase);
         foreach ($matches as $match) {
-            $tag = L\guessLocaleFromString($match, $lang_tag, 10);
+            $tag = L\guessLocaleFromString($match, $lang_tag);
             $not_check = true;
             foreach ($not_contains as $not_contain) {
                 if (strstr($match, $not_contain)) {
ViewGit