diff --git a/bin/fetcher.php b/bin/fetcher.php index ffc8a3140..75a2c2624 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -596,7 +596,7 @@ class Fetcher implements CrawlConstants $not_downloaded[] = $site; } else { $duplicates[] = $site[self::URL]; - echo "Deduplicated:".$site[self::URL]."\n"; + crawlLog("Deduplicated:".$site[self::URL]); } } @@ -1064,10 +1064,12 @@ class Fetcher implements CrawlConstants $description_length = $info[self::DESCRIPTION_LENGTH]; $link_length = $info[self::LINK_LENGTH]; - $title_ratio = $title_length/$average_title_length; - $description_ratio = - $description_length/$average_description_length; - $link_ratio = $link_length/$average_total_link_text_length; + $title_ratio = ($average_title_length > 0) ? + $title_length/$average_title_length : 0; + $description_ratio = ($average_description_length > 0) ? + $description_length/$average_description_length : 0; + $link_ratio = ($average_total_link_text_length > 0) ? + $link_length/$average_total_link_text_length : 0; if(isset($info[self::TITLE_WORDS])) { foreach($info[self::TITLE_WORDS] diff --git a/controllers/search_controller.php b/controllers/search_controller.php index 0328bf634..8b027f373 100755 --- a/controllers/search_controller.php +++ b/controllers/search_controller.php @@ -376,11 +376,19 @@ class SearchController extends Controller implements CrawlConstants $dom = new DOMDocument(); - @$dom->loadHTML($cache_file); + $did_dom = @$dom->loadHTML($cache_file); + $xpath = new DOMXPath($dom); $body = $dom->getElementsByTagName('body')->item(0); + if($body == false) { + $cache_file = "<html><head><title>Yioop! Cache</title></head>". 
+ "<body>".htmlentities($cache_file)."</body></html>"; + $dom = new DOMDocument(); + @$dom->loadHTML($cache_file); + $body = $dom->getElementsByTagName('body')->item(0); + } $first_child = $body->firstChild; $divNode = $dom->createElement('div'); @@ -402,13 +410,19 @@ class SearchController extends Controller implements CrawlConstants $i = 0; foreach($words as $word) { - if(strlen($word) > 0) { - $match = crawlHash($word).$word; - $newDoc = preg_replace("/$match/i", - '<span style="background-color:'. - $colors[$i].'">$0</span>', $newDoc); - $i = ($i + 1) % $color_count; - $newDoc = preg_replace("/".crawlHash($word)."/", "", $newDoc); + //only mark string of length at least 2 + if(strlen($word) > 1) { + $mark_prefix = crawlHash($word); + if(stristr($mark_prefix, $word) !== false) { + $mark_prefix = preg_replace( + "/$word/i", '', $mark_prefix); + } + $match = $mark_prefix.$word; + $newDoc = preg_replace("/$match/i", + '<span style="background-color:'. + $colors[$i].'">$0</span>', $newDoc); + $i = ($i + 1) % $color_count; + $newDoc = preg_replace("/".$mark_prefix."/", "", $newDoc); } } @@ -442,9 +456,15 @@ class SearchController extends Controller implements CrawlConstants $text = $clone->textContent; foreach($words as $word) { - if(strlen($word) > 0) { + //only mark string of length at least 2 + if(strlen($word) > 1) { + $mark_prefix = crawlHash($word); + if(stristr($mark_prefix, $word) !== false) { + $mark_prefix = preg_replace( + "/$word/i", '', $mark_prefix); + } $text = preg_replace( - "/$word/i", crawlHash($word).'$0', $text); + "/$word/i", $mark_prefix.'$0', $text); } } diff --git a/lib/index_shard.php b/lib/index_shard.php index 03447bee1..e903bc5b1 100644 --- a/lib/index_shard.php +++ b/lib/index_shard.php @@ -43,4 +43,49 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} class IndexShard extends PersistentStructure implements Serializable { + var $doc_ids; + var $word_docs; + var $count_doc256; + + function __construct() + { + } + + function 
addDocumentWords($doc_id, $word_id_array) + { + $this->doc_ids[] = $doc_id; + + foreach($word_id_array as $word_id => $relevance) { + $relevance = $relevance & 255; + $store = pack("N", $this->count_doc256 + $relevance); + $this->word_docs[$word_id] .= $store; + } + + $this->count_doc256 += 256; + } + + function getWordSlice($word_id, $start, $len) + { + $result = array(); + if(isset($this->word_docs[$word_id])) { + $docs_string = substr($this->word_docs[$word_id], $start << 2, $len <<2); + //check if got at least one item + if($docs_string !== false && ($doc_len = strlen($docs_string)) > 3) { + for($i = 0; $i < $doc_len; $i += 4) { + } + } + } + + return $result; + } + + function appendIndexShard($index_shard) + { + } + + function docCount() + { + return ($this->count_doc256 >> 8); + } + } diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index 4dbc73758..9d2846fe7 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -68,19 +68,24 @@ class HtmlProcessor extends TextProcessor */ public static function process($page, $url) { + $summary = NULL; if(is_string($page)) { $dom = self::dom($page); - if(self::checkMetaRobots($dom)) { + if($dom !==false && self::checkMetaRobots($dom)) { $summary[self::TITLE] = self::title($dom); $summary[self::DESCRIPTION] = self::description($dom); $summary[self::LINKS] = self::links($dom, $url); - return $summary; + if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) + == 0 && count($summary[self::LINKS]) == 0) { + //maybe not html? 
treat as text still try to get urls + $summary = parent::process($page, $url); + } } } - return NULL; + return $summary; } diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php index c98580447..e9714da78 100755 --- a/lib/processors/text_processor.php +++ b/lib/processors/text_processor.php @@ -67,7 +67,8 @@ class TextProcessor implements CrawlConstants $summary[self::TITLE] = ""; $summary[self::DESCRIPTION] = mb_substr($page, 0, 400); $summary[self::LINKS] = self::extractHttpHttpsUrls($page); - $summary[self::PAGE] = "<html><body><pre>$page</pre></body></html>"; + $summary[self::PAGE] = "<html><body><pre>". + strip_tags($page)."</pre></body></html>"; } return $summary; } diff --git a/lib/string_array.php b/lib/string_array.php index 831ea4dd8..6d830f865 100755 --- a/lib/string_array.php +++ b/lib/string_array.php @@ -41,7 +41,9 @@ require_once "persistent_structure.php"; /** * Memory efficient implementation of persistent arrays * - * The standard array ob + * The standard array objects in php and even spl have a large amount of + * overhead. The point of this class is to have the size as close to the + * optimal as possible * * @author Chris Pollett * diff --git a/models/model.php b/models/model.php index d82d5c31c..426d3b9c5 100755 --- a/models/model.php +++ b/models/model.php @@ -132,6 +132,10 @@ class Model implements CrawlConstants $page[self::TITLE] = substr(strip_tags($page[self::DESCRIPTION]), 0, $end_title). $ellipsis; + //still no text revert to url + if(strlen($page[self::TITLE]) == 0) { + $page[self::TITLE] = $page[self::URL]; + } } diff --git a/models/phrase_model.php b/models/phrase_model.php index e6d2d6db0..b92584645 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -137,6 +137,14 @@ class PhraseModel extends Model } + /** + * Determines the offset into the summaries WebArchiveBundle of the + * provided url so that it can be retrieved. 
This relies on the + * fact that the info:url meta word has been stored. + * + * @param string $url what to lookup + * @return int offset into the web archive bundle + */ function lookupSummaryOffset($url) { $index_archive_name = self::index_data_base_name . $this->index_name; @@ -160,6 +168,15 @@ class PhraseModel extends Model return $summary_offset; } + /** + * Parses from a string phrase representing a conjunctive query, a struct + * consisting of the words keys searched for, the allowed and disallowed + * phrases, the weight that should be put on these query results, and + * which archive to use. + * + * @param string $phrase string to extract struct from + * @return array struct representing the conjunctive query + */ function parseWordStructConjunctiveQuery($phrase) { $phrase = " ".$phrase; diff --git a/views/search_view.php b/views/search_view.php index 9be7cc805..71cadab4b 100755 --- a/views/search_view.php +++ b/views/search_view.php @@ -115,9 +115,10 @@ class SearchView extends View implements CrawlConstants <div class='result'> <h2> <a href="<?php if($page[self::TYPE] != "link") { - e($page[self::URL]); - } else - e(strip_tags($page[self::TITLE])); ?>" ><?php + e($page[self::URL]); + } else { + e(strip_tags($page[self::TITLE])); + } ?>" ><?php if(isset($page[self::THUMB]) && $page[self::THUMB] != 'NULL') { ?><img src="<?php e($page[self::THUMB]); ?>" alt="<?php e($page[self::TITLE]); ?>" /> <?php