Fix bugs in Summarizers so that they can always output sentences with punctuation; fix bug in English tokenizer when lexicon path was incorrect, a=chris

Chris Pollett [2018-05-17 01:May:th]

Fix bugs in Summarizers so that they can always output sentences with punctuation; fix bug in English tokenizer when lexicon path was incorrect, a=chris

Filename
src/controllers/components/CrawlComponent.php
src/executables/ArcTool.php
src/executables/Fetcher.php
src/library/PhraseParser.php
src/library/UrlParser.php
src/library/indexing_plugins/AddressesPlugin.php
src/library/indexing_plugins/IndexingPlugin.php
src/library/indexing_plugins/RecipePlugin.php
src/library/processors/HtmlProcessor.php
src/library/summarizers/CentroidSummarizer.php
src/library/summarizers/ScrapeSummarizer.php
src/locale/en_US/resources/Tokenizer.php
tests/HiTokenizerTest.php

diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index 99d9f7802..9b26b2d29 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -1518,7 +1518,7 @@ class CrawlComponent extends Component implements CrawlConstants
             }
             $meta_ids = PhraseParser::calculateMetas($site);
             if (!$site[self::JUST_METAS]) {
-                $host_words = UrlParser::getWordsIfHostUrl($site[self::URL]);
+                $host_words = UrlParser::getWordsInHostUrl($site[self::URL]);
                 $path_words = UrlParser::getWordsLastPathPartUrl(
                     $site[self::URL]);
                 $phrase_string = $host_words." .. ".$site[self::TITLE] .
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 80f424aef..f098830af 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -911,7 +911,7 @@ class ArcTool implements CrawlConstants
                      */
                     $lang = null;
                     if (!isset($site[self::JUST_METAS])) {
-                        $host_words = UrlParser::getWordsIfHostUrl($site_url);
+                        $host_words = UrlParser::getWordsInHostUrl($site_url);
                         $path_words = UrlParser::getWordsLastPathPartUrl(
                             $site_url);
                         if ($is_link) {
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index fd81f39cb..346e50176 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -2779,7 +2779,7 @@ class Fetcher implements CrawlConstants
              */
             $lang = null;
             if (!isset($site[self::JUST_METAS])) {
-                $host_words = UrlParser::getWordsIfHostUrl($site_url);
+                $host_words = UrlParser::getWordsInHostUrl($site_url);
                 $path_words = UrlParser::getWordsLastPathPartUrl(
                     $site_url);
                 if ($is_link) {
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 63177a66d..46efbdaa8 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -361,10 +361,16 @@ class PhraseParser
             "tagTokenizePartOfSpeech") &&
             !isset(self::$programming_language_map[$lang])) {
             $string = mb_strtolower($string);
-            $sentences = preg_split("/(\n\n+)|\.|\!|\?|。/u", $string);
+            $pre_sentences = preg_split("/(\n\n+)|\.|\!|\?|。/u", $string);
             $pos = 0;
             $sentences_pos = [];
-            $sentences = array_filter($sentences);
+            $sentences = [];
+            foreach ($pre_sentences as $pre_sentence) {
+                $pre_sentence = trim($pre_sentence);
+                if (!empty($pre_sentence)) {
+                    $sentences[] = $pre_sentence;
+                }
+            }
             foreach ($sentences as $sentence) {
                 if (empty($sentences_pos[$sentence])) {
                     $sentences_pos[$sentence] = [$pos];
@@ -404,7 +410,7 @@ class PhraseParser
             if ($lang == "hi") {
                 $string = preg_replace('/(,:)\p{P}/u', "", $string);
             }
-            $string = mb_ereg_replace("\s+|".C\PUNCT, " ", $string);
+            $string = mb_ereg_replace("\s+|" . C\PUNCT, " ", $string);
             $terms = self::segmentSegment($string, $lang);
             $terms = self::charGramTerms($terms, $lang);
             $terms = self::stemTerms($terms, $lang);
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 2ab932772..88939871e 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -482,7 +482,7 @@ class UrlParser
      * @return string space separated words extracted.
      *
      */
-    public static function getWordsIfHostUrl($url)
+    public static function getWordsInHostUrl($url)
     {
         $words = [];
         $url_parts = @parse_url($url);
diff --git a/src/library/indexing_plugins/AddressesPlugin.php b/src/library/indexing_plugins/AddressesPlugin.php
index 9e2b5cd4d..def10ed56 100644
--- a/src/library/indexing_plugins/AddressesPlugin.php
+++ b/src/library/indexing_plugins/AddressesPlugin.php
@@ -532,7 +532,7 @@ class AddressesPlugin extends IndexingPlugin implements CrawlConstants
     public function parsePhones($line)
     {
         $phones = [];
-        $line = preg_replace('/('.C\PUNCT.'|\s)+/',"", $line);
+        $line = preg_replace('/(' . C\PUNCT . '|\s)+/',"", $line);
         $phone_keywords = "/sales|mobile|phone|call|电话|電話|fono|fone|".
             "fon|foon|전화|φωνο|фон/ui";
         $phone_parts = preg_split($phone_keywords, $line);
diff --git a/src/library/indexing_plugins/IndexingPlugin.php b/src/library/indexing_plugins/IndexingPlugin.php
index fc831851e..2a018bdb1 100644
--- a/src/library/indexing_plugins/IndexingPlugin.php
+++ b/src/library/indexing_plugins/IndexingPlugin.php
@@ -26,7 +26,7 @@
  * @author Priya Gangaraju priya.gangaraju@gmail.com, Chris Pollett
  * @license https://www.gnu.org/licenses/ GPL3
  * @link https://www.seekquarry.com/
- * @copyright 2011 - 2014
+ * @copyright 2011 - 2018
  * @filesource
  */
 namespace seekquarry\yioop\library\indexing_plugins;
diff --git a/src/library/indexing_plugins/RecipePlugin.php b/src/library/indexing_plugins/RecipePlugin.php
index 63cc4ce78..30e84a4fe 100644
--- a/src/library/indexing_plugins/RecipePlugin.php
+++ b/src/library/indexing_plugins/RecipePlugin.php
@@ -27,7 +27,7 @@
  *     chris@pollett.org
  * @license https://www.gnu.org/licenses/ GPL3
  * @link https://www.seekquarry.com/
- * @copyright 2011 -2017
+ * @copyright 2011 - 2018
  * @filesource
  */
 namespace seekquarry\yioop\library\indexing_plugins;
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index a980a76ef..59c1be909 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -377,7 +377,7 @@ class HtmlProcessor extends TextProcessor
         //look for a meta tag with a description
         foreach ($metas as $meta) {
             if (stristr($meta->getAttribute('name'), "description")) {
-                $description .= " .. ".$meta->getAttribute('content');
+                $description .= " .. " . $meta->getAttribute('content');
             }
         }
         if (self::$max_description_len > 2 * C\MAX_DESCRIPTION_LEN) {
@@ -433,7 +433,6 @@ class HtmlProcessor extends TextProcessor
             if ($first_len > 3 * $add_len) break;
         }
         $description = preg_replace("/(\s)+/u", " ",  $description);
-
         return $description;
     }
     /**
diff --git a/src/library/summarizers/CentroidSummarizer.php b/src/library/summarizers/CentroidSummarizer.php
index 8fc24adb1..fc743a7b7 100644
--- a/src/library/summarizers/CentroidSummarizer.php
+++ b/src/library/summarizers/CentroidSummarizer.php
@@ -180,12 +180,12 @@ class CentroidSummarizer extends Summarizer
             }
             $i++;
         }
-        if (strlen($formatted_doc) < PageProcessor::$max_description_len
+        if (strlen($page) < PageProcessor::$max_description_len
             || $n == 1) {
             //if input short only use above to get a word cloud
-            $formatted_doc = substr($formatted_doc, 0,
+            $page = substr($page, 0,
                 PageProcessor::$max_description_len);
-            return [$formatted_doc, $word_cloud];
+            return [$page, $word_cloud];
         }
         ksort($wc);
         /* Calculate similarity measure between centroid and each sentence */
@@ -335,7 +335,6 @@ class CentroidSummarizer extends Summarizer
             '/\[(.*?)\]/', '/\t\n/'
         ];
         $page = preg_replace($substitutions, ' ', $page);
-        $page = preg_replace('/\s{2,}/', ' ', $page);
         $new_page = preg_replace("/\<br\s*(\/)?\s*\>/", "\n", $page);
         $changed = false;
         if ($new_page != $page) {
@@ -348,12 +347,12 @@ class CentroidSummarizer extends Summarizer
         $page = preg_replace("/\&\#\d{3}(\d?)\;|\&\w+\;/", " ", $page);
         $page = preg_replace("/\</", " <", $page);
         $page = strip_tags($page);
-
         if ($changed) {
             $page = preg_replace("/(\r?\n[\t| ]*){2}/", "\n", $page);
         }
         $page = preg_replace("/(\r?\n[\t| ]*)/", "\n", $page);
         $page = preg_replace("/\n\n\n+/", "\n\n", $page);
+        $page = preg_replace('/\s\s+/', ' ', $page);
         return $page;
     }
 }
diff --git a/src/library/summarizers/ScrapeSummarizer.php b/src/library/summarizers/ScrapeSummarizer.php
index 9f1b38293..e15353b3a 100644
--- a/src/library/summarizers/ScrapeSummarizer.php
+++ b/src/library/summarizers/ScrapeSummarizer.php
@@ -144,13 +144,13 @@ class ScrapeSummarizer extends Summarizer
             }
             foreach ($data as $datum) {
                 $datum = PhraseParser::compressSentence($datum, $lang);
-                $description .= " .. ". $datum;
+                $description .= " ..\n ". $datum;
                 if (self::OUTPUT_TO_FILE) {
                     if ($output_file_contents == "") {
                         $output_file_contents = trim($datum);
                     } else {
                         $output_file_contents = $output_file_contents .
-                            "\r\n" . trim($datum);
+                            "\n" . trim($datum);
                     }
                 }
             }
diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php
index d735fe1fe..037134aa1 100755
--- a/src/locale/en_US/resources/Tokenizer.php
+++ b/src/locale/en_US/resources/Tokenizer.php
@@ -379,7 +379,7 @@ class Tokenizer
     {
         static $dictionary = [];
         static $dictionary = [];
-        $lexicon_file = C\LOCALE_DIR . "/en-US/resources/lexicon.txt.gz";
+        $lexicon_file = C\LOCALE_DIR . "/en_US/resources/lexicon.txt.gz";
         if (empty($dictionary)) {
             if (file_exists($lexicon_file)) {
                 $lines = gzfile($lexicon_file);
diff --git a/tests/HiTokenizerTest.php b/tests/HiTokenizerTest.php
index 0e8722e82..38b3b32d7 100644
--- a/tests/HiTokenizerTest.php
+++ b/tests/HiTokenizerTest.php
@@ -104,7 +104,7 @@ class HiTokenizerTest extends UnitTest
     {
         $tokenizer = $this->test_objects['FILE1'];
         //ideally will get work in new version
-        //echo
+        // echo
         // $tokenizer::tagPartsOfSpeechPhrase("महामा गाँधी का जम 2 अक्टूबर को हुआ");
     }
 }

ViewGit