diff --git a/bin/fetcher.php b/bin/fetcher.php index 6c1339626..99d0bd0da 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -703,7 +703,7 @@ class Fetcher implements CrawlConstants } /** - * + * @param array &$info */ function setCrawlParamsFromArray(&$info) { diff --git a/controllers/search_controller.php b/controllers/search_controller.php index 34046c89c..36f3b5bb9 100755 --- a/controllers/search_controller.php +++ b/controllers/search_controller.php @@ -90,13 +90,19 @@ class SearchController extends Controller implements CrawlConstants { $data = array(); $view = "search"; + $start_time = microtime(); + if(isset($_REQUEST['f']) && $_REQUEST['f']=='rss' && RSS_ACCESS) { $view = "rss"; } else if (!WEB_ACCESS) { return; } - $start_time = microtime(); + if(isset($_REQUEST['raw']) && $_REQUEST['raw'] == true) { + $raw = true; + } else { + $raw = false; + } if(isset($_SESSION['MAX_PAGES_TO_SHOW']) ) { $results_per_page = $_SESSION['MAX_PAGES_TO_SHOW']; @@ -180,7 +186,7 @@ class SearchController extends Controller implements CrawlConstants $data = $this->processQuery( $query, $activity, $arg, - $results_per_page, $limit, $index_time_stamp); + $results_per_page, $limit, $index_time_stamp, $raw); // calculate the results of a search if there is one } else { $highlight = true; @@ -236,10 +242,14 @@ class SearchController extends Controller implements CrawlConstants * for those query terms will be return, then the eleventh, etc. 
* @param int $index_name the timestamp of an index to use, if 0 then * default used + * @param int $raw ($raw == 0) normal grouping, ($raw == 1) + * no grouping but page look-up for links, ($raw == 2) + * no grouping done on data + * * @return array an array of at most results_per_page many search results */ function processQuery($query, $activity, $arg, $results_per_page, - $limit = 0, $index_name = 0) + $limit = 0, $index_name = 0, $raw = 0) { $no_index_given = false; if($index_name == 0) { @@ -291,7 +301,7 @@ class SearchController extends Controller implements CrawlConstants $top_query = implode(" ", $top_phrases); $phrase_results = $this->phraseModel->getPhrasePageResults( $top_query, $limit, $results_per_page, false, NULL, - $use_cache_if_possible); + $use_cache_if_possible, $raw); $data['PAGING_QUERY'] = "index.php?c=search&a=related&arg=". urlencode($url); @@ -329,7 +339,7 @@ class SearchController extends Controller implements CrawlConstants $filter = $this->searchfiltersModel->getFilter(); $phrase_results = $this->phraseModel->getPhrasePageResults( $query, $limit, $results_per_page, true, $filter, - $use_cache_if_possible); + $use_cache_if_possible, $raw); $query = $original_query; } $data['PAGING_QUERY'] = "index.php?q=".urlencode($query); @@ -451,14 +461,18 @@ class SearchController extends Controller implements CrawlConstants * cache: queries) * @param int $results_per_page number of results to return * @param int $limit first result to return from the ordered query results + * @param int $raw ($raw == 0) normal grouping, ($raw == 1) + * no grouping but page look-up for links, ($raw == 2) + * no grouping done on data * * @return array associative array of results for the query performed */ - public function queryRequest($query, $results_per_page, $limit = 0) + public function queryRequest($query, $results_per_page, $limit = 0, + $raw = 0) { return (API_ACCESS) ? 
$this->processQuery($query, "query", "", $results_per_page, - $limit) : NULL; + $limit, $raw) : NULL; } /** @@ -468,16 +482,18 @@ class SearchController extends Controller implements CrawlConstants * @param string $url to find related documents for * @param int $results_per_page number of results to return * @param int $limit first result to return from the ordered query results + * @param int $raw ($raw == 0) normal grouping, ($raw == 1) + * no grouping but page look-up for links, ($raw == 2) + * no grouping done on data * * @return array associative array of results for the query performed - */ public function relatedRequest($url, $results_per_page, $limit = 0, - $crawl_time = 0) + $crawl_time = 0, $raw = 0) { return (API_ACCESS) ? $this->processQuery("", "related", $url, $results_per_page, - $limit, $crawl_time) : NULL; + $limit, $crawl_time, $raw) : NULL; } /** diff --git a/lib/fetch_url.php b/lib/fetch_url.php index b3f4a486f..4cabe25d9 100755 --- a/lib/fetch_url.php +++ b/lib/fetch_url.php @@ -69,8 +69,6 @@ class FetchUrl implements CrawlConstants $key=CrawlConstants::URL, $value=CrawlConstants::PAGE, $hash=CrawlConstants::HASH) { - static $ex_cnt = 0; - $agent_handler = curl_multi_init(); $active = NULL; diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php index 8426ef8f9..d998f934a 100644 --- a/lib/index_bundle_iterators/group_iterator.php +++ b/lib/index_bundle_iterators/group_iterator.php @@ -100,10 +100,20 @@ class GroupIterator extends IndexBundleIterator var $grouped_hashes; /** + * Used to keep track and to weight pages based on the number of other + * pages from the same domain * @var array */ var $domain_factors; + /** + * Flag used to tell group iterator whether to do a usual grouping + * or to only look-up parent pages for links for which a parent page + * hasn't been seen + * @var bool + */ + var $only_lookup; + /** * the minimum number of pages to group from a block; * this trumps 
$this->index_bundle_iterator->results_per_block @@ -121,15 +131,21 @@ class GroupIterator extends IndexBundleIterator * @param object $index_bundle_iterator to use as a source of documents * to iterate over */ - function __construct($index_bundle_iterator, $num_iterators = 1) + function __construct($index_bundle_iterator, $num_iterators = 1, + $only_lookup = false) { $this->index_bundle_iterator = $index_bundle_iterator; $this->num_docs = $this->index_bundle_iterator->num_docs; - $this->results_per_block = max( - $this->index_bundle_iterator->results_per_block, - self::MIN_FIND_RESULTS_PER_BLOCK); - - $this->results_per_block /= ceil($num_iterators/2); + if($only_lookup) { + $this->results_per_block = + $this->index_bundle_iterator->results_per_block; + } else { + $this->results_per_block = max( + $this->index_bundle_iterator->results_per_block, + self::MIN_FIND_RESULTS_PER_BLOCK); + $this->results_per_block /= ceil($num_iterators/2); + } + $this->only_lookup = $only_lookup; $this->reset(); } @@ -179,21 +195,28 @@ class GroupIterator extends IndexBundleIterator $this->current_block_hashes = array(); $this->current_seen_hashes = array(); if($this->count_block_unfiltered > 0 ) { - /* next we group like documents by url and remember which urls we've - seen this block - */ - - $pre_out_pages = $this->groupByHashUrl($pages); - - /*get doc page for groups of link data if exists and don't have - also aggregate by hash - */ - $this->groupByHashAndAggregate($pre_out_pages); - $this->count_block = count($pre_out_pages); - /* - Calculate aggregate values for each field of the groups we found - */ - $pages = $this->computeOutPages($pre_out_pages); + if($this->only_lookup) { + + $pages = $this->insertUnseenDocs($pages); + $this->count_block = count($pages); + } else { + /* next we group like documents by url and remember + which urls we've seen this block + */ + + $pre_out_pages = $this->groupByHashUrl($pages); + + /*get doc page for groups of link data if exists and don't have 
+ also aggregate by hash + */ + $this->groupByHashAndAggregate($pre_out_pages); + $this->count_block = count($pre_out_pages); + /* + Calculate aggregate values for each field of the groups we + found + */ + $pages = $this->computeOutPages($pre_out_pages); + } } $this->pages = $pages; return $pages; @@ -293,44 +316,24 @@ class GroupIterator extends IndexBundleIterator { $domain_vector = array(); foreach($pre_out_pages as $hash_url => $data) { - if(!$pre_out_pages[$hash_url][0][self::IS_DOC]) { - $hash_info_url= - crawlHash("info:".base64Hash($hash_url), true); - $index = $this->getIndex($pre_out_pages[$hash_url][0]['KEY']); - $word_iterator = - new WordIterator($hash_info_url, - $index, true); - $doc_array = $word_iterator->currentDocsWithWord(); - if(is_array($doc_array) && count($doc_array) == 1) { - $relevance = $this->computeRelevance( - $word_iterator->current_generation, - $word_iterator->current_offset); - $keys = array_keys($doc_array); - $key = $keys[0]; - $item = $doc_array[$key]; - $item[self::RELEVANCE] = $relevance; - $item[self::SCORE] += $relevance; - $item['KEY'] = $key; - $item['INDEX'] = $word_iterator->index; - $item[self::HASH] = substr($key, - IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN); - $item[self::INLINKS] = substr($key, - 2*IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN); + if(!$data[0][self::IS_DOC]) { + $item = $this->lookupDoc($data[0]['KEY']); + if($item != false) { array_unshift($pre_out_pages[$hash_url], $item); } } $this->aggregateScores($hash_url, $pre_out_pages[$hash_url]); - if(isset($pre_out_pages[$hash_url][0][self::HASH])) { - $hash = $pre_out_pages[$hash_url][0][self::HASH]; + if(isset($pre_out_pages[$hash_url][self::HASH])) { + $hash = $pre_out_pages[$hash_url][self::HASH]; if(isset($this->grouped_hashes[$hash])) { unset($pre_out_pages[$hash_url]); } else if(isset($this->current_seen_hashes[$hash])) { $previous_url = $this->current_seen_hashes[$hash]; if($pre_out_pages[$previous_url][0][ self::HASH_SUM_SCORE] >= - 
$pre_out_pages[$hash_url][0][self::HASH_SUM_SCORE]) { + $pre_out_pages[$hash_url][0][self::HASH_SUM_SCORE]){ unset($pre_out_pages[$hash_url]); } else { $this->current_seen_hashes[$hash] = $hash_url; @@ -343,6 +346,109 @@ class GroupIterator extends IndexBundleIterator } } + /** + * Looks up a doc for a link doc_key, so can get its summary info + * + * @param string $doc_key key to look up doc of + * + * @return array consisting of info about the doc + */ + function lookupDoc($doc_key) + { + $hash_url = substr($doc_key, 0, IndexShard::DOC_KEY_LEN); + $hash_info_url= + crawlHash("info:".base64Hash($hash_url), true); + $index = $this->getIndex($doc_key); + $word_iterator = + new WordIterator($hash_info_url, + $index, true); + $doc_array = $word_iterator->currentDocsWithWord(); + $item = false; + if(is_array($doc_array) && count($doc_array) == 1) { + $relevance = $this->computeRelevance( + $word_iterator->current_generation, + $word_iterator->current_offset); + $keys = array_keys($doc_array); + $key = $keys[0]; + $item = $doc_array[$key]; + $item[self::RELEVANCE] = $relevance; + $item[self::SCORE] = $item[self::DOC_RANK]*pow(1.1, $relevance); + $item['KEY'] = $key; + $item['INDEX'] = $word_iterator->index; + $item[self::HASH] = substr($key, + IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN); + $item[self::INLINKS] = substr($key, + 2*IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN); + } + return $item; + } + + /** + * This function is called if $raw mode 1 was requested. 
In this + * mode no grouping is done, but if a link does not correspond to + * a doc file already listed, then an attempt to look up the doc is + * done + * + * @param array $pages an array of links or docs returned by the + * iterator that had been fed into this group iterator + * + * @return array new pages where docs have been added if possible + */ + function insertUnseenDocs($pages) + { + $new_pages = array(); + $doc_keys = array_keys($pages); + $need_docs = array(); + foreach($doc_keys as $key) { + $hash_url = substr($key, 0, IndexShard::DOC_KEY_LEN); + $need_docs[$hash_url] = $key; + } + $need_docs = array_diff_key($need_docs, $this->grouped_keys); + foreach($pages as $doc_key => $doc_info) { + $doc_info['KEY'] = $doc_key; + $hash_url = substr($doc_key, 0, IndexShard::DOC_KEY_LEN); + $doc_info[self::HASH] = substr($doc_key, + IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN); + // inlinks is the domain of the inlink + $doc_info[self::INLINKS] = substr($doc_key, + 2 * IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN); + $new_pages[$doc_key] = $doc_info; + if($doc_info[self::IS_DOC]) { + if(isset($need_docs[$hash_url])) { + unset($need_docs[$hash_url]); + } + } + if(!isset($this->grouped_keys[$hash_url])) { + /* + new url found in this block + */ + $this->current_block_hashes[] = $hash_url; + } + } + + $item_pages = array(); + if(is_array($need_docs)) { + $need_docs = array_unique($need_docs); + foreach($need_docs as $hash_url => $doc_key) { + $item = $this->lookupDoc($doc_key); + if($item != false) { + $item_pages[$hash_url] = $item; + } + } + } + + $new_pages = array_merge($new_pages, $item_pages); + + foreach($new_pages as $doc_key => $doc_info) { + $new_pages[$doc_key][self::SUMMARY_OFFSET] = array(); + $new_pages[$doc_key][self::SUMMARY_OFFSET][] = + array($doc_info["KEY"], $doc_info[self::GENERATION], + $doc_info[self::SUMMARY_OFFSET]); + } + + return $new_pages; + } + /** * For a collection of grouped pages generates a grouped summary for each * group 
and returns an array of out pages consisting diff --git a/locale/fr-FR/configure.ini b/locale/fr-FR/configure.ini index 1537e218f..60a7d9577 100755 --- a/locale/fr-FR/configure.ini +++ b/locale/fr-FR/configure.ini @@ -841,10 +841,10 @@ pagination_helper_next = "Proch." ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/layouts ; ; rss_layout.php line: 64 -rss_layout_title = "" +rss_layout_title = "Moteur de recherche PHP -Yioop! %s" ; ; rss_layout.php line: 72 -rss_layout_description = "" +rss_layout_description = "%s Résultats" ; ; web_layout.php line: 65 web_layout_title = "Moteur de recherche PHP -Yioop!" diff --git a/locale/ja/configure.ini b/locale/ja/configure.ini index 94576f7c3..57d27f49d 100755 --- a/locale/ja/configure.ini +++ b/locale/ja/configure.ini @@ -895,7 +895,7 @@ search_view_rank = "ランク:%s" search_view_relevancy = "関連:%s" ; ; search_view.php line: 139 -search_view_proximity = "" +search_view_proximity = "近さ: %s" ; ; search_view.php line: 141 search_view_score = "スコア %s" diff --git a/locale/vi-VN/configure.ini b/locale/vi-VN/configure.ini index 1540c2d15..6f390df52 100755 --- a/locale/vi-VN/configure.ini +++ b/locale/vi-VN/configure.ini @@ -889,13 +889,13 @@ search_view_calculated = "Đã tính toán trong %s giây search_view_results = "Cho kết quả tứ %s - %s của %s" ; ; search_view.php line: 135 -search_view_rank = "Thứ Tự:" +search_view_rank = "Thứ Tự: %s" ; ; search_view.php line: 137 -search_view_relevancy = "Thích hợp:" +search_view_relevancy = "Thích hợp: %s" ; ; search_view.php line: 139 -search_view_proximity = "" +search_view_proximity = "Gần: %s" ; ; search_view.php line: 141 search_view_score = "Điểm: %s" diff --git a/models/phrase_model.php b/models/phrase_model.php index 8a855ed24..264c8f45d 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -205,11 +205,16 @@ class PhraseModel extends Model * an attempt will be made to look up the results in either * the file cache or memcache. 
Otherwise, items will be recomputed * and then potentially restored in cache + * @param int $raw ($raw == 0) normal grouping, ($raw == 1) + * no grouping but page look-up for links, ($raw == 2) + * no grouping done on data + * * @return array an array of summary data */ function getPhrasePageResults( $input_phrase, $low = 0, $results_per_page = NUM_RESULTS_PER_PAGE, - $format = true, $filter = NULL, $use_cache_if_allowed = true) + $format = true, $filter = NULL, $use_cache_if_allowed = true, + $raw = 0) { if(QUERY_STATISTICS) { $indent= " "; @@ -328,7 +333,7 @@ class PhraseModel extends Model } $out_results = $this->getSummariesByHash($word_structs, - $low, $phrase_num, $filter, $use_cache_if_allowed); + $low, $phrase_num, $filter, $use_cache_if_allowed, $raw); if(isset($out_results['PAGES']) && count($out_results['PAGES']) != 0) { @@ -684,10 +689,14 @@ class PhraseModel extends Model * an attempt will be made to look up the results in either * the file cache or memcache. Otherwise, items will be recomputed * and then potentially restored in cache + * @param int $raw ($raw == 0) normal grouping, ($raw == 1) + * no grouping but page look-up for links, ($raw == 2) + * no grouping done on data + * * @return array document summaries */ function getSummariesByHash($word_structs, $limit, $num, &$filter, - $use_cache_if_allowed = true) + $use_cache_if_allowed = true, $raw = 0) { global $CACHE; @@ -730,7 +739,7 @@ class PhraseModel extends Model } } - $query_iterator = $this->getQueryIterator($word_structs, $filter); + $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw); $num_retrieved = 0; $pages = array(); @@ -748,7 +757,6 @@ class PhraseModel extends Model $num_retrieved++; } } - } usort($pages, "scoreOrderCallback"); @@ -797,10 +805,15 @@ class PhraseModel extends Model * INDEX_ARCHIVE -- an index_archive object to get results from * @param array &$filter an array of hashes of domains to filter from * results + * and then potentially restored in 
cache + * @param int $raw ($raw == 0) normal grouping, ($raw == 1) + * no grouping but page look-up for links, ($raw == 2) + * no grouping done on data + * * @return &object an iterator for iterating through results to the * query */ - function getQueryIterator($word_structs, &$filter) + function getQueryIterator($word_structs, &$filter, $raw = 0) { $iterators = array(); $total_iterators = 0; @@ -858,7 +871,17 @@ class PhraseModel extends Model $union_iterator = new UnionIterator($iterators); } - $group_iterator = new GroupIterator($union_iterator, $total_iterators); + $raw = intval($raw); + if ($raw == 2) { + $group_iterator = $union_iterator; + } else if ($raw == 1) { + + $group_iterator = + new GroupIterator($union_iterator, $total_iterators, true); + } else { + $group_iterator = + new GroupIterator($union_iterator, $total_iterators); + } return $group_iterator; }