diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php index 25ff84f55..b1957e8fa 100644 --- a/src/controllers/components/CrawlComponent.php +++ b/src/controllers/components/CrawlComponent.php @@ -1634,13 +1634,11 @@ class CrawlComponent extends Component implements CrawlConstants $site[self::URL]); $phrase_string = $host_words . " .. ".$site[self::TITLE] . " .. ". $path_words . " .. ". $site[self::DESCRIPTION]; - if (!empty($site[self::TITLE])) { - $lang = L\guessLocaleFromString($site[self::TITLE], $lang); - } else { - $lang = L\guessLocaleFromString($site[self::DESCRIPTION], + if (empty($site[self::LANG])) { + $lang = L\guessLocaleFromString($phrase_string, $lang); + $site[self::LANG] = $lang; } - $site[self::LANG] = $lang; $word_lists = PhraseParser::extractPhrasesInLists( $phrase_string, $lang); $len = strlen($phrase_string); diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 6837951b0..6164c600e 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -2928,8 +2928,7 @@ class Fetcher implements CrawlConstants } if (empty($site[self::LANG])) { $lang = L\guessLocaleFromString( - mb_substr($site[self::DESCRIPTION], 0, - C\AD_HOC_TITLE_LENGTH), C\DEFAULT_LOCALE); + $site[self::DESCRIPTION]), C\DEFAULT_LOCALE); } else { $lang = $site[self::LANG]; } diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php index f977f3771..220423453 100755 --- a/src/library/LocaleFunctions.php +++ b/src/library/LocaleFunctions.php @@ -100,8 +100,7 @@ function guessLocale() */ function guessLocaleFromString($phrase_string, $locale_tag = null) { - $original_phrase_string = mb_substr($phrase_string, 0, - C\AD_HOC_TITLE_LENGTH); + $original_phrase_string = $phrase_string; $locale_tag = ($locale_tag == null) ? getLocaleTag() : $locale_tag; $sub = C\PUNCT . "|[0-9]|\s"; $phrase_string = preg_replace('/' . $sub . '/', "", $phrase_string); @@ -153,7 +152,17 @@ function guessLocaleFromString($phrase_string, $locale_tag = null) } } if ($locale_tag == 'en-US') { - $locale_tag = checkQuery($original_phrase_string); + $len = strlen($original_phrase_string); + $locale_tag = 'en-US'; + foreach (['en-US', 'fr-FR', 'es', 'it'] as $lang) { + $tokenizer = PhraseParser::getTokenizer($lang); + $test_len = + strlen($tokenizer->stopwordsRemover($original_phrase_string)); + if ($test_len < $len) { + $len = $test_len; + $locale_tag = $lang; + } + } } return $locale_tag; } diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index 2d9393010..f49eb242d 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -1371,7 +1371,7 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け static $unsafe_terms = []; /* took keywords from top level domains from some of theporndude list */ - static $unsafe_host_regex = "/porn|xvideos|livejasmin|". + static $unsafe_url_regex = "/porn|xvideos|livejasmin|". "xhamster|bongacams|chaturbate|pussy|spankbang|". "xnxx|tnxx|beeg|daftsex|redtube|youjizz|vidz7|4tube|cumlouder|" . "tnaflix|xfantasy|vdiz24|luxuretv|perfectgirls|anysex|drtuber|" . @@ -1399,8 +1399,8 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け "luscious\.net|hentai|freeones\.com|iafd|gayboystube|". "adam4adam|cams\.com|mrskin|adultwork|oglaf|streamate|". "nifty\.org|adultdvd|suicidegirls|ftvgirls|asstr|private\.com|". - "squirt\.org|fakku|faapy|fux\.com|txxx/i"; - if (!empty($url) && preg_match($unsafe_host_regex, $url)) { + "squirt\.org|fakku|faapy|fux\.com|txxx|\Wnude\W/i"; + if (!empty($url) && preg_match($unsafe_url_regex, $url)) { return 1; } if (count($word_lists) == 0) { diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php index 7248b7b9d..0d0ce8f10 100755 --- a/src/library/WebArchive.php +++ b/src/library/WebArchive.php @@ -118,7 +118,7 @@ class WebArchive } else { $this->iterator_pos = 0; $this->count = 0; - $fh = fopen($this->filename, "w"); + $fh = fopen($this->filename, "w"); $this->writeInfoBlock($fh); fclose($fh); } @@ -221,7 +221,7 @@ class WebArchive * used to modify $data before it is written * to the info block. For instance, we can add offset info to data. * @param bool $return_flag if true rather than adjust the offsets by - * reference, create copy objects and adjust their offsets anf return + * reference, create copy objects and adjust their offsets and return * @return mixed adjusted objects or void */ public function addObjects($offset_field, &$objects, diff --git a/src/library/WebArchiveBundle.php b/src/library/WebArchiveBundle.php index 38686c96e..a70e5a7ff 100755 --- a/src/library/WebArchiveBundle.php +++ b/src/library/WebArchiveBundle.php @@ -232,7 +232,7 @@ class WebArchiveBundle * * @param int $index the number of the partition within this bundle to * return - * @param bool $fast_construct should the constructor of the WebArchive + * @param bool $fast_construct tells the constructor of the WebArchive * avoid reading in its info block. * @return object the WebArchive file which was requested */ @@ -246,14 +246,24 @@ class WebArchiveBundle $create_flag = false; $compressor = C\NS_LIB . "compressors\\" . $this->compressor; $compressor_obj = new $compressor(); - $archive_name = $this->dir_name."/web_archive_".$index + $archive_name = $this->dir_name."/web_archive_" . $index . $compressor_obj->fileExtension(); if (!file_exists($archive_name)) { $create_flag = true; } + $archive_name_exists = file_exists($archive_name); $this->partition[$index] = new WebArchive($archive_name, new $compressor(), $fast_construct); + if (!$archive_name_exists) { + /* always add a dummy record so an offset 0 of a real record + can never be legit. This is just to be on the safe side + if a changeDocumentOffsets in IndexShard happens not to work + */ + $dummy_pages = [["DUMMY"]]; + $this->partition[$index]->addObjects("DUMMY_OFFSET", + $dummy_pages); + } if ($create_flag && file_exists($archive_name)) { chmod($archive_name, 0777); } diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php index 0cc9f138d..549df7bf3 100755 --- a/src/library/processors/HtmlProcessor.php +++ b/src/library/processors/HtmlProcessor.php @@ -120,7 +120,7 @@ class HtmlProcessor extends TextProcessor $summary[self::TITLE] = self::crudeTitle($dom_page); } $summary[self::LANG] = self::lang($dom, - $summary[self::TITLE], $url); + strip_tags($page), $url); $description_dom = $dom; if (!empty($scraper)) { $scrape_results = ScraperManager::applyScraperRules( diff --git a/src/library/summarizers/ScrapeSummarizer.php b/src/library/summarizers/ScrapeSummarizer.php index 8d4f085a0..279146bfa 100644 --- a/src/library/summarizers/ScrapeSummarizer.php +++ b/src/library/summarizers/ScrapeSummarizer.php @@ -100,7 +100,7 @@ class ScrapeSummarizer extends Summarizer } if ($pos > 0 && !empty($block_ranks[$changeable_index]) && $max_score/$score_pos < $block_ranks[$changeable_index]) { - $fixed_summary_len += strlen($block_ranks[$changeable_index]); + $fixed_summary_len += strlen($blocks[$changeable_index]); $changeable_index++; if ($fixed_summary_len > $max_summary_len) { break; diff --git a/src/library/summarizers/Summarizer.php b/src/library/summarizers/Summarizer.php index 1113f998f..65020eca5 100644 --- a/src/library/summarizers/Summarizer.php +++ b/src/library/summarizers/Summarizer.php @@ -351,14 +351,15 @@ class Summarizer $summary = ""; $summary_length = 0; $top = self::numSentencesForSummary($sentence_scores, $sentences); - if ($top <= 1) { + if ($top < 1) { if (!empty($sentences[0])) { - $summary = $sentences[0]; + $summary = substr($sentences[0], 0, + PageProcessor::$max_description_len); return [ltrim($summary), [1]]; } } $summary_indices = array_keys(array_slice($sentence_scores, 0, - $top - 1, true)); + $top, true)); sort($summary_indices); $eos = ($lang == 'hi') ? "।" : "."; //default end of sentence symbol $summary_scores = []; diff --git a/src/models/LocaleModel.php b/src/models/LocaleModel.php index 0bcdb345e..581bf0889 100755 --- a/src/models/LocaleModel.php +++ b/src/models/LocaleModel.php @@ -146,7 +146,7 @@ class LocaleModel extends Model the statistics text file contains info used to calculate what fraction of strings have been translated */ - $tag_prefix = C\LOCALE_DIR."/".str_replace("-", "_", + $tag_prefix = C\LOCALE_DIR . "/" . str_replace("-", "_", $locale['LOCALE_TAG']); if (!file_exists($tag_prefix)) { mkdir($tag_prefix); //create locale_dirs that are missing @@ -386,8 +386,8 @@ class LocaleModel extends Model { $sql = "DELETE FROM LOCALE WHERE LOCALE_TAG = ?"; $this->db->execute($sql, [$locale_tag]); - if (file_exists(C\LOCALE_DIR."/$locale_tag")) { - $this->db->unlinkRecursive(C\LOCALE_DIR."/$locale_tag", true); + if (file_exists(C\LOCALE_DIR . "/$locale_tag")) { + $this->db->unlinkRecursive(C\LOCALE_DIR . "/$locale_tag", true); } } /** diff --git a/src/models/ParallelModel.php b/src/models/ParallelModel.php index d69873529..1b97675d4 100755 --- a/src/models/ParallelModel.php +++ b/src/models/ParallelModel.php @@ -292,6 +292,16 @@ class ParallelModel extends Model $index = IndexManager::getIndex($index_name); if (is_integer($summary_offset) && is_integer($generation)) { + if ($summary_offset == 0) { + /* + we insert a dummy object at start of each + partition, so no legit lookup should have + offset 0. It still might happen, if a + changeDocumentOffsets failed, so we add + this check + */ + continue; + } $page = $index->getPage($summary_offset, $generation); } else { diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index 33210d6f7..c782e9809 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -895,7 +895,7 @@ class PhraseModel extends ParallelModel $matches = $matches[2]; $result_phrase = preg_replace($pattern, "", $phrase); foreach ($matches as $match) { - $tag = L\guessLocaleFromString($match, $lang_tag, 10); + $tag = L\guessLocaleFromString($match, $lang_tag); $not_check = true; foreach ($not_contains as $not_contain) { if (strstr($match, $not_contain)) { @@ -940,7 +940,7 @@ class PhraseModel extends ParallelModel $matches = $matches[2]; $result_phrase = preg_replace($pattern, " ", $phrase); foreach ($matches as $match) { - $tag = L\guessLocaleFromString($match, $lang_tag, 10); + $tag = L\guessLocaleFromString($match, $lang_tag); $not_check = true; foreach ($not_contains as $not_contain) { if (strstr($match, $not_contain)) {