diff --git a/bin/fetcher.php b/bin/fetcher.php index 362f65fbd..22cd013bc 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -1255,7 +1255,7 @@ class Fetcher implements CrawlConstants $this->found_sites[self::SEEN_URLS][] = $summary; $link_text = - mb_ereg_replace("[[:punct:]]", " ", $link_text); + mb_ereg_replace(PUNCT, " ", $link_text); $link_word_counts = PhraseParser::extractPhrasesAndCount($link_text); $link_shard->addDocumentWords($link_keys, @@ -1282,7 +1282,6 @@ class Fetcher implements CrawlConstants crawlLog(" Build mini inverted index time ". (changeInMicrotime($start_time))); } -} /** * Calculates the meta words to be associated with a given downloaded diff --git a/lib/fetch_url.php b/lib/fetch_url.php index e53fda044..e2772275b 100755 --- a/lib/fetch_url.php +++ b/lib/fetch_url.php @@ -254,7 +254,7 @@ class FetchUrl implements CrawlConstants $line = trim($line); if(stristr($line, 'Server:')) { $server_parts = explode("Server:", $line); - $server_name_parts = explode("/", $server_parts[1]); + $server_name_parts = @explode("/", $server_parts[1]); $site[CrawlConstants::SERVER] = @trim($server_name_parts[0]); if(isset($server_name_parts[1])) { $version_parts = explode("(", $server_name_parts[1]); diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php index 8f2669f25..fce41deab 100644 --- a/lib/index_bundle_iterators/group_iterator.php +++ b/lib/index_bundle_iterators/group_iterator.php @@ -202,7 +202,6 @@ class GroupIterator extends IndexBundleIterator $done = false; do { $new_pages = $this->index_bundle_iterator->currentDocsWithWord(); - if(!is_array($new_pages)) { $done = true; if(count($pages) == 0) { @@ -218,6 +217,7 @@ class GroupIterator extends IndexBundleIterator $done = true; } } while(!$done); + return $pages; } diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php index d68f5d151..9e2a63193 100644 --- 
a/lib/index_bundle_iterators/intersect_iterator.php +++ b/lib/index_bundle_iterators/intersect_iterator.php @@ -96,9 +96,9 @@ class IntersectIterator extends IndexBundleIterator $this->results_per_block = 1; /* - the most results we can return is the size of the least num_docs - of what we are iterating over. We are also setting up here - that we return at most one posting at a time from each + We take an initial guess of the num_docs we return as the sum + of the num_docs of the underlying iterators. We are also setting + up here that we return at most one posting at a time from each iterator */ for($i = 0; $i < $this->num_iterators; $i++) { diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index 73f45c9c4..65e444193 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -71,10 +71,11 @@ class HtmlProcessor extends TextProcessor $summary = NULL; if(is_string($page)) { $dom = self::dom($page); - if($dom !==false && self::checkMetaRobots($dom)) { - $summary[self::LANG] = self::lang($dom); + if($dom !== false && self::checkMetaRobots($dom)) { $summary[self::TITLE] = self::title($dom); - $summary[self::DESCRIPTION] = self::description($dom); + $summary[self::DESCRIPTION] = self::description($dom); + $summary[self::LANG] = self::lang($dom, + $summary[self::DESCRIPTION]); $summary[self::LINKS] = self::links($dom, $url); $summary[self::PAGE] = $page; if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) @@ -133,19 +134,37 @@ class HtmlProcessor extends TextProcessor /** * Determines the language of the html document by looking at the root - * language attribute + * language attribute.
If that fails $sample_text is used to try to guess + * the language + * + * @param object $dom a document object to check the language of + * @param string $sample_text sample text to try to guess the language from * - * @param object $dom - a document object to check the language of - * * @return string language tag for guessed language - */ - static function lang($dom) + static function lang($dom, $sample_text = NULL) { $xpath = new DOMXPath($dom); $html = $xpath->evaluate("/html"); + $lang = NULL; if(is_object($html->item(0))) { $lang = $html->item(0)->getAttribute('lang'); + } + if($lang == NULL && $sample_text != NULL){ + $words = mb_split("[[:space:]]|".PUNCT, $sample_text); + $num_words = count($words); + $ascii_count = 0; + foreach($words as $word) { + if(strlen($word) == mb_strlen($word)) { + $ascii_count++; + } + } + // crude, but let's guess ASCII == english + if($ascii_count/$num_words > 0.9) { + $lang = 'en'; + } else { + $lang = NULL; + } } else { $lang = NULL; } diff --git a/lib/processors/rss_processor.php b/lib/processors/rss_processor.php index 0ea3481b4..78f3d7622 100644 --- a/lib/processors/rss_processor.php +++ b/lib/processors/rss_processor.php @@ -73,11 +73,11 @@ class RssProcessor extends TextProcessor $dom = self::dom($page); if($dom !==false) { - $summary[self::LANG] = self::lang($dom); $summary[self::TITLE] = self::title($dom); $summary[self::DESCRIPTION] = self::description($dom); + $summary[self::LANG] = self::lang($dom, + $summary[self::DESCRIPTION]); $summary[self::LINKS] = self::links($dom, $url); - if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) == 0 && count($summary[self::LINKS]) == 0) { //maybe not rss?
treat as text still try to get urls @@ -94,18 +94,35 @@ class RssProcessor extends TextProcessor * language tag * * @param object $dom - a document object to check the language of - * + * @param string $sample_text sample text to try to guess the language from + * * @return string language tag for guessed language - */ - static function lang($dom) + static function lang($dom, $sample_text = NULL) { $xpath = new DOMXPath($dom); $languages = $xpath->evaluate("/rss/channel/language"); if($languages && is_object($languages)) { return $languages->item(0)->textContent; + } else if($sample_text != NULL){ + $words = mb_split("[[:space:]]|".PUNCT, $sample_text); + $num_words = count($words); + $ascii_count = 0; + foreach($words as $word) { + if(strlen($word) == mb_strlen($word)) { + $ascii_count++; + } + } + // crude, but let's guess ASCII == english + if($ascii_count/$num_words > 0.9) { + $lang = 'en'; + } else { + $lang = NULL; + } + } else { + $lang = NULL; } - return NULL; + return $lang; } /** diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php index 8daa72be5..a835e0f91 100755 --- a/lib/processors/text_processor.php +++ b/lib/processors/text_processor.php @@ -66,6 +66,8 @@ class TextProcessor implements CrawlConstants if(is_string($page)) { $summary[self::TITLE] = ""; $summary[self::DESCRIPTION] = mb_substr($page, 0, 400); + $summary[self::LANG] = self::calculateLang( + $summary[self::DESCRIPTION]); $summary[self::LINKS] = self::extractHttpHttpsUrls($page); $summary[self::PAGE] = "<html><body><pre>".
strip_tags($page)."</pre></body></html>"; @@ -73,6 +75,38 @@ class TextProcessor implements CrawlConstants return $summary; } + + /** + * Tries to determine the language of the document by looking at the + * $sample_text provided (a crude guess: mostly-ASCII text is taken to + * be English, anything else yields NULL) + * @param string $sample_text sample text to try to guess the language from + * + * @return string language tag for guessed language + */ + static function calculateLang($sample_text = NULL) + { + if($sample_text != NULL){ + $words = mb_split("[[:space:]]|".PUNCT, $sample_text); + $num_words = count($words); + $ascii_count = 0; + foreach($words as $word) { + if(strlen($word) == mb_strlen($word)) { + $ascii_count++; + } + } + // crude, but let's guess ASCII == english + if($ascii_count/$num_words > 0.9) { + $lang = 'en'; + } else { + $lang = NULL; + } + } else { + $lang = NULL; + } + return $lang; + } + /** * Gets the text between two tags in a document starting at the current * position.