diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php index 99d9f7802..9b26b2d29 100644 --- a/src/controllers/components/CrawlComponent.php +++ b/src/controllers/components/CrawlComponent.php @@ -1518,7 +1518,7 @@ class CrawlComponent extends Component implements CrawlConstants } $meta_ids = PhraseParser::calculateMetas($site); if (!$site[self::JUST_METAS]) { - $host_words = UrlParser::getWordsIfHostUrl($site[self::URL]); + $host_words = UrlParser::getWordsInHostUrl($site[self::URL]); $path_words = UrlParser::getWordsLastPathPartUrl( $site[self::URL]); $phrase_string = $host_words." .. ".$site[self::TITLE] . diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index 80f424aef..f098830af 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -911,7 +911,7 @@ class ArcTool implements CrawlConstants */ $lang = null; if (!isset($site[self::JUST_METAS])) { - $host_words = UrlParser::getWordsIfHostUrl($site_url); + $host_words = UrlParser::getWordsInHostUrl($site_url); $path_words = UrlParser::getWordsLastPathPartUrl( $site_url); if ($is_link) { diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index fd81f39cb..346e50176 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -2779,7 +2779,7 @@ class Fetcher implements CrawlConstants */ $lang = null; if (!isset($site[self::JUST_METAS])) { - $host_words = UrlParser::getWordsIfHostUrl($site_url); + $host_words = UrlParser::getWordsInHostUrl($site_url); $path_words = UrlParser::getWordsLastPathPartUrl( $site_url); if ($is_link) { diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index 63177a66d..46efbdaa8 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -361,10 +361,16 @@ class PhraseParser "tagTokenizePartOfSpeech") && !isset(self::$programming_language_map[$lang])) { $string = mb_strtolower($string); - $sentences = preg_split("/(\n\n+)|\.|\!|\?|。/u", $string); + $pre_sentences = preg_split("/(\n\n+)|\.|\!|\?|。/u", $string); $pos = 0; $sentences_pos = []; - $sentences = array_filter($sentences); + $sentences = []; + foreach ($pre_sentences as $pre_sentence) { + $pre_sentence = trim($pre_sentence); + if (!empty($pre_sentence)) { + $sentences[] = $pre_sentence; + } + } foreach ($sentences as $sentence) { if (empty($sentences_pos[$sentence])) { $sentences_pos[$sentence] = [$pos]; @@ -404,7 +410,7 @@ class PhraseParser if ($lang == "hi") { $string = preg_replace('/(,:)\p{P}/u', "", $string); } - $string = mb_ereg_replace("\s+|".C\PUNCT, " ", $string); + $string = mb_ereg_replace("\s+|" . C\PUNCT, " ", $string); $terms = self::segmentSegment($string, $lang); $terms = self::charGramTerms($terms, $lang); $terms = self::stemTerms($terms, $lang); diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php index 2ab932772..88939871e 100755 --- a/src/library/UrlParser.php +++ b/src/library/UrlParser.php @@ -482,7 +482,7 @@ class UrlParser * @return string space separated words extracted. * */ - public static function getWordsIfHostUrl($url) + public static function getWordsInHostUrl($url) { $words = []; $url_parts = @parse_url($url); diff --git a/src/library/indexing_plugins/AddressesPlugin.php b/src/library/indexing_plugins/AddressesPlugin.php index 9e2b5cd4d..def10ed56 100644 --- a/src/library/indexing_plugins/AddressesPlugin.php +++ b/src/library/indexing_plugins/AddressesPlugin.php @@ -532,7 +532,7 @@ class AddressesPlugin extends IndexingPlugin implements CrawlConstants public function parsePhones($line) { $phones = []; - $line = preg_replace('/('.C\PUNCT.'|\s)+/',"", $line); + $line = preg_replace('/(' . C\PUNCT . '|\s)+/',"", $line); $phone_keywords = "/sales|mobile|phone|call|电话|電話|fono|fone|". "fon|foon|전화|φωνο|фон/ui"; $phone_parts = preg_split($phone_keywords, $line); diff --git a/src/library/indexing_plugins/IndexingPlugin.php b/src/library/indexing_plugins/IndexingPlugin.php index fc831851e..2a018bdb1 100644 --- a/src/library/indexing_plugins/IndexingPlugin.php +++ b/src/library/indexing_plugins/IndexingPlugin.php @@ -26,7 +26,7 @@ * @author Priya Gangaraju priya.gangaraju@gmail.com, Chris Pollett * @license https://www.gnu.org/licenses/ GPL3 * @link https://www.seekquarry.com/ - * @copyright 2011 - 2014 + * @copyright 2011 - 2018 * @filesource */ namespace seekquarry\yioop\library\indexing_plugins; diff --git a/src/library/indexing_plugins/RecipePlugin.php b/src/library/indexing_plugins/RecipePlugin.php index 63cc4ce78..30e84a4fe 100644 --- a/src/library/indexing_plugins/RecipePlugin.php +++ b/src/library/indexing_plugins/RecipePlugin.php @@ -27,7 +27,7 @@ * chris@pollett.org * @license https://www.gnu.org/licenses/ GPL3 * @link https://www.seekquarry.com/ - * @copyright 2011 -2017 + * @copyright 2011 - 2018 * @filesource */ namespace seekquarry\yioop\library\indexing_plugins; diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php index a980a76ef..59c1be909 100755 --- a/src/library/processors/HtmlProcessor.php +++ b/src/library/processors/HtmlProcessor.php @@ -377,7 +377,7 @@ class HtmlProcessor extends TextProcessor //look for a meta tag with a description foreach ($metas as $meta) { if (stristr($meta->getAttribute('name'), "description")) { - $description .= " .. ".$meta->getAttribute('content'); + $description .= " .. " . $meta->getAttribute('content'); } } if (self::$max_description_len > 2 * C\MAX_DESCRIPTION_LEN) { @@ -433,7 +433,6 @@ class HtmlProcessor extends TextProcessor if ($first_len > 3 * $add_len) break; } $description = preg_replace("/(\s)+/u", " ", $description); - return $description; } /** diff --git a/src/library/summarizers/CentroidSummarizer.php b/src/library/summarizers/CentroidSummarizer.php index 8fc24adb1..fc743a7b7 100644 --- a/src/library/summarizers/CentroidSummarizer.php +++ b/src/library/summarizers/CentroidSummarizer.php @@ -180,12 +180,12 @@ class CentroidSummarizer extends Summarizer } $i++; } - if (strlen($formatted_doc) < PageProcessor::$max_description_len + if (strlen($page) < PageProcessor::$max_description_len || $n == 1) { //if input short only use above to get a word cloud - $formatted_doc = substr($formatted_doc, 0, + $page = substr($page, 0, PageProcessor::$max_description_len); - return [$formatted_doc, $word_cloud]; + return [$page, $word_cloud]; } ksort($wc); /* Calculate similarity measure between centroid and each sentence */ @@ -335,7 +335,6 @@ class CentroidSummarizer extends Summarizer '/\[(.*?)\]/', '/\t\n/' ]; $page = preg_replace($substitutions, ' ', $page); - $page = preg_replace('/\s{2,}/', ' ', $page); $new_page = preg_replace("/\<br\s*(\/)?\s*\>/", "\n", $page); $changed = false; if ($new_page != $page) { @@ -348,12 +347,12 @@ class CentroidSummarizer extends Summarizer $page = preg_replace("/\&\#\d{3}(\d?)\;|\&\w+\;/", " ", $page); $page = preg_replace("/\</", " <", $page); $page = strip_tags($page); - if ($changed) { $page = preg_replace("/(\r?\n[\t| ]*){2}/", "\n", $page); } $page = preg_replace("/(\r?\n[\t| ]*)/", "\n", $page); $page = preg_replace("/\n\n\n+/", "\n\n", $page); + $page = preg_replace('/\s\s+/', ' ', $page); return $page; } } diff --git a/src/library/summarizers/ScrapeSummarizer.php b/src/library/summarizers/ScrapeSummarizer.php index 9f1b38293..e15353b3a 100644 --- a/src/library/summarizers/ScrapeSummarizer.php +++ b/src/library/summarizers/ScrapeSummarizer.php @@ -144,13 +144,13 @@ class ScrapeSummarizer extends Summarizer } foreach ($data as $datum) { $datum = PhraseParser::compressSentence($datum, $lang); - $description .= " .. ". $datum; + $description .= " ..\n ". $datum; if (self::OUTPUT_TO_FILE) { if ($output_file_contents == "") { $output_file_contents = trim($datum); } else { $output_file_contents = $output_file_contents . - "\r\n" . trim($datum); + "\n" . trim($datum); } } } diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php index d735fe1fe..037134aa1 100755 --- a/src/locale/en_US/resources/Tokenizer.php +++ b/src/locale/en_US/resources/Tokenizer.php @@ -379,7 +379,7 @@ class Tokenizer { static $dictionary = []; static $dictionary = []; - $lexicon_file = C\LOCALE_DIR . "/en-US/resources/lexicon.txt.gz"; + $lexicon_file = C\LOCALE_DIR . "/en_US/resources/lexicon.txt.gz"; if (empty($dictionary)) { if (file_exists($lexicon_file)) { $lines = gzfile($lexicon_file); diff --git a/tests/HiTokenizerTest.php b/tests/HiTokenizerTest.php index 0e8722e82..38b3b32d7 100644 --- a/tests/HiTokenizerTest.php +++ b/tests/HiTokenizerTest.php @@ -104,7 +104,7 @@ class HiTokenizerTest extends UnitTest { $tokenizer = $this->test_objects['FILE1']; //ideally will get work in new version - //echo + // echo // $tokenizer::tagPartsOfSpeechPhrase("महामा गाँधी का जम 2 अक्टूबर को हुआ"); } }