diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php index 49cb8accf..829eaa9a9 100644 --- a/lib/crawl_constants.php +++ b/lib/crawl_constants.php @@ -135,9 +135,10 @@ interface CrawlConstants const RELEVANCE ='an'; const DUPLICATE ='ao'; const META_WORDS ='ap'; + const CACHE_PAGE_PARTITION = 'aq'; const NEEDS_OFFSET_FLAG = 0x7FFFFFFE; const DUPLICATE_FLAG = 0x7FFFFFFF; - const CACHE_PAGE_PARTITION = 'aq'; + } ?> diff --git a/lib/fetch_url.php b/lib/fetch_url.php index 738201f4f..37c05a632 100755 --- a/lib/fetch_url.php +++ b/lib/fetch_url.php @@ -169,7 +169,7 @@ class FetchUrl implements CrawlConstants if(isset($encoding_parts[1])) { $sites[$i][self::ENCODING] = mb_strtoupper(trim($encoding_parts[1])); - //hopefuly safe to trust encoding sent + //hopefully safe to trust encoding sent } } else { $sites[$i][self::ENCODING] = diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php index 2d2e30bbc..30b42bd1b 100644 --- a/lib/index_bundle_iterators/group_iterator.php +++ b/lib/index_bundle_iterators/group_iterator.php @@ -90,6 +90,11 @@ class GroupIterator extends IndexBundleIterator */ var $grouped_keys; + /** + * the minimum number of pages to group from a block; + * this trumps $this->index_bundle_iterator->results_per_block + */ + const MIN_FIND_RESULTS_PER_BLOCK = 200; /** * Creates a group iterator with the given parameters. 
@@ -102,8 +107,9 @@ class GroupIterator extends IndexBundleIterator { $this->index_bundle_iterator = $index_bundle_iterator; $this->num_docs = $this->index_bundle_iterator->num_docs; - $this->results_per_block = - $this->index_bundle_iterator->results_per_block; + $this->results_per_block = max( + $this->index_bundle_iterator->results_per_block, + self::MIN_FIND_RESULTS_PER_BLOCK); $this->reset(); } @@ -120,6 +126,19 @@ class GroupIterator extends IndexBundleIterator $this->seen_docs_unfiltered = 0; } + /** + * Computes a relevancy score for a posting offset with respect to this + * iterator + * @param int $posting_offset an offset into word_docs to compute the + * relevance of + * @return float a relevancy score based on BM25F. + */ + function computeRelevance($posting_offset) + { + return $this->index_bundle_iterator->computeRelevance( + $posting_offset); + } + /** * Hook function used by currentDocsWithWord to return the current block * of docs if it is not cached @@ -128,9 +147,26 @@ class GroupIterator extends IndexBundleIterator */ function findDocsWithWord() { - $pages = - $this->index_bundle_iterator->currentDocsWithWord(); - + $pages = array(); + $count = 0; + $done = false; + do { + $new_pages = $this->index_bundle_iterator->currentDocsWithWord(); + if(!is_array($new_pages)) { + $done = true; + if(count($pages) == 0) { + $pages = -1; + } + } else { + $pages = array_merge($pages, $new_pages); + $count = count($pages); + } + if($count < $this->results_per_block && !$done) { + $this->index_bundle_iterator->advance(); + } else { + $done = true; + } + } while(!$done); $this->count_block_unfiltered = count($pages); if(!is_array($pages)) { return $pages; @@ -187,12 +223,17 @@ class GroupIterator extends IndexBundleIterator $this->getIndex(), true); $doc_array = $word_iterator->currentDocsWithWord(); if(is_array($doc_array) && count($doc_array) == 1) { + $relevance = $this->computeRelevance( + $word_iterator->current_offset); $keys = array_keys($doc_array); $key 
= $keys[0]; - if(!isset($doc_array[$key][self::DUPLICATE]) ) {; - $pre_out_pages[$hash_url][$key] = $doc_array[$key]; - $pre_out_pages[$hash_url][$key]['IS_PAGE'] = true; - $pre_out_pages[$hash_url][$key]['KEY'] = $key; + if(!isset($doc_array[$key][self::DUPLICATE]) ) { + $item = $doc_array[$key]; + $item[self::RELEVANCE] += $relevance; + $item[self::SCORE] += $relevance; + $item['IS_PAGE'] = true; + $item['KEY'] = $key; + array_unshift($pre_out_pages[$hash_url], $item); } else { /* Deduplication: @@ -288,9 +329,15 @@ class GroupIterator extends IndexBundleIterator list($key, $summary_offset) = $offset_array; $index = & $this->getIndex($key); $page = $index->getPage($summary_offset); + if($page == array()) {continue;} if(!isset($out_pages[$doc_key][self::SUMMARY])) { $out_pages[$doc_key][self::SUMMARY] = $page; } else if (isset($page[self::DESCRIPTION])) { + if(!isset($out_pages[$doc_key][ + self::SUMMARY][self::DESCRIPTION])) { + $out_pages[$doc_key][self::SUMMARY][ + self::DESCRIPTION] = ""; + } $out_pages[$doc_key][self::SUMMARY][self::DESCRIPTION].= " .. ".$page[self::DESCRIPTION]; } diff --git a/lib/index_bundle_iterators/index_bundle_iterator.php b/lib/index_bundle_iterators/index_bundle_iterator.php index 3878121d7..7c5f7e174 100644 --- a/lib/index_bundle_iterators/index_bundle_iterator.php +++ b/lib/index_bundle_iterators/index_bundle_iterator.php @@ -88,6 +88,15 @@ abstract class IndexBundleIterator implements CrawlConstants */ const RESULTS_PER_BLOCK = 100; + /** + * Computes a relevancy score for a posting offset with respect to this + * iterator + * @param int $posting_offset an offset into word_docs to compute the + * relevance of + * @return float a relevancy score based on BM25F. 
+ */ + abstract function computeRelevance($posting_offset); + /** * Returns the iterators to the first document block that it could iterate * over @@ -127,8 +136,6 @@ abstract class IndexBundleIterator implements CrawlConstants * Gets the current block of doc ids and score associated with the * this iterators word * - * @param bool $with_summaries specifies whether or not to return the - * summaries associated with the document * @return mixed doc ids and score if there are docs left, -1 otherwise */ function currentDocsWithWord() diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php index 31f18f3a8..427f0105a 100644 --- a/lib/index_bundle_iterators/intersect_iterator.php +++ b/lib/index_bundle_iterators/intersect_iterator.php @@ -119,8 +119,9 @@ class IntersectIterator extends IndexBundleIterator */ function reset() { - foreach($this->index_bundle_iterators as $iterator) { - $iterator->reset(); + for($i = 0; $i < $this->num_iterators; $i++) { + $this->index_bundle_iterators[$i]->setResultsPerBlock(1); + $this->index_bundle_iterators[$i]->reset(); } $this->seen_docs = 0; @@ -128,6 +129,23 @@ class IntersectIterator extends IndexBundleIterator } + /** + * Computes a relevancy score for a posting offset with respect to this + * iterator + * @param int $posting_offset an offset into word_docs to compute the + * relevance of + * @return float a relevancy score based on BM25F. 
+ */ + function computeRelevance($posting_offset) + { + $relevance = 0; + for($i = 0; $i < $this->num_iterators; $i++) { + $relevance += $this->index_bundle_iterators[$i]->computeRelevance( + $posting_offset); + } + return $relevance; + } + /** * Hook function used by currentDocsWithWord to return the current block * of docs if it is not cached @@ -142,8 +160,23 @@ class IntersectIterator extends IndexBundleIterator if($status == -1) { return -1; } + //next we finish computing BM25F $docs = $this->index_bundle_iterators[0]->currentDocsWithWord(); - $this->count_block = count($docs); + + if(is_array($docs) && count($docs) == 1) { + //we get intersect docs one at a time so should be only one + $keys = array_keys($docs); + $key = $keys[0]; + for($i = 1; $i < $this->num_iterators; $i++) { + $i_docs = + $this->index_bundle_iterators[$i]->currentDocsWithWord(); + + $docs[$key][self::RELEVANCE] += $i_docs[$key][self::RELEVANCE]; + } + $docs[$key][self::SCORE] = $docs[$key][self::DOC_RANK] + + $docs[$key][self::RELEVANCE]; + } + $this->count_block = count($docs); $this->pages = $docs; return $docs; } @@ -161,7 +194,7 @@ class IntersectIterator extends IndexBundleIterator $this->index_bundle_iterators[ $i]->currentDocOffsetWithWord(); if($i == 0) { - $biggest_offset = $new_doc_offset[$i]; + $biggest_offset = $new_doc_offset[0]; } if($new_doc_offset[$i] == -1) { return -1; @@ -169,6 +202,8 @@ class IntersectIterator extends IndexBundleIterator if($new_doc_offset[$i] > $biggest_offset) { $biggest_offset = $new_doc_offset[$i]; $all_same = false; + } else if ($new_doc_offset[$i] < $biggest_offset) { + $all_same = false; } } if($all_same) { @@ -176,6 +211,7 @@ class IntersectIterator extends IndexBundleIterator } for($i = 0; $i < $this->num_iterators; $i++) { if($new_doc_offset[$i] < $biggest_offset) { + $this->index_bundle_iterators[$i]->advance($biggest_offset); } } @@ -205,7 +241,6 @@ class IntersectIterator extends IndexBundleIterator floor(($this->seen_docs * 
$total_num_docs) / $this->seen_docs_unfiltered); } - $this->index_bundle_iterators[0]->advance($doc_offset); } diff --git a/lib/index_bundle_iterators/phrase_filter_iterator.php b/lib/index_bundle_iterators/phrase_filter_iterator.php index 36e492812..8a1361472 100644 --- a/lib/index_bundle_iterators/phrase_filter_iterator.php +++ b/lib/index_bundle_iterators/phrase_filter_iterator.php @@ -141,6 +141,19 @@ class PhraseFilterIterator extends IndexBundleIterator $doc_block = $this->currentDocsWithWord(); } + /** + * Computes a relevancy score for a posting offset with respect to this + * iterator + * @param int $posting_offset an offset into word_docs to compute the + * relevance of + * @return float a relevancy score based on BM25F. + */ + function computeRelevance($posting_offset) + { + return $this->index_bundle_iterator->computeRelevance( + $posting_offset); + } + /** * Hook function used by currentDocsWithWord to return the current block * of docs if it is not cached diff --git a/lib/index_bundle_iterators/union_iterator.php b/lib/index_bundle_iterators/union_iterator.php index a5cb0b6bd..4f4f008b1 100644 --- a/lib/index_bundle_iterators/union_iterator.php +++ b/lib/index_bundle_iterators/union_iterator.php @@ -131,6 +131,23 @@ class UnionIterator extends IndexBundleIterator } + /** + * Computes a relevancy score for a posting offset with respect to this + * iterator + * @param int $posting_offset an offset into word_docs to compute the + * relevance of + * @return float a relevancy score based on BM25F. 
+ */ + function computeRelevance($posting_offset) + { + $relevance = 0; + for($i = 0; $i < $this->num_iterators; $i++) { + $relevance += $this->index_bundle_iterators[$i]->computeRelevance( + $posting_offset); + } + return $relevance; + } + /** * Hook function used by currentDocsWithWord to return the current block * of docs if it is not cached diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php index 25e4846c7..9cf31b405 100644 --- a/lib/index_bundle_iterators/word_iterator.php +++ b/lib/index_bundle_iterators/word_iterator.php @@ -146,6 +146,21 @@ class WordIterator extends IndexBundleIterator } } + /** + * Computes a relevancy score for a posting offset with respect to this + * iterator + * @param int $posting_offset an offset into word_docs to compute the + * relevance of + * @return float a relevancy score based on BM25F. + */ + function computeRelevance($posting_offset) + { + $item = array(); + $this->index->getCurrentShard()->makeItem($item, + $this->start_offset, $posting_offset, $this->last_offset, 1); + return $item[self::RELEVANCE]; + } + /** * Returns the iterators to the first document block that it could iterate * over @@ -173,6 +188,7 @@ class WordIterator extends IndexBundleIterator $this->next_offset = $this->current_offset; //the next call also updates next offset $results = $this->index->getCurrentShard()->getPostingsSlice( + $this->start_offset, $this->next_offset, $this->last_offset, $this->results_per_block); return $results; } diff --git a/lib/index_shard.php b/lib/index_shard.php index 09b9c2f2e..2a52e33d6 100644 --- a/lib/index_shard.php +++ b/lib/index_shard.php @@ -341,89 +341,107 @@ class IndexShard extends PersistentStructure implements CrawlConstants * reference the value of $next_offset will point to the next record in * the list (if it exists) after the function is called. * + * @param int $start_offset of the current posting list for query term + * used in calculating BM25F. 
* @param int &$next_offset where to start in word docs * @param int $last_offset offset at which to stop by * @param int $len number of documents desired * @return array desired list of doc's and their info */ - function getPostingsSlice(&$next_offset, $last_offset, $len) + function getPostingsSlice($start_offset, &$next_offset, $last_offset, $len) { if(!$this->read_only_from_disk && !$this->word_docs_packed) { $this->packWordDocs(); } $num_docs_so_far = 0; - $num_doc_or_links = ($next_offset > 0) ? - ($last_offset - $next_offset) >> 2 - : 1; $results = array(); $end = min($this->word_docs_len, $last_offset); do { if($next_offset > $end) {break;} $item = array(); - $posting = $this->getWordDocsSubstring($next_offset, 4); - list($doc_index, $occurrences) = $this->unpackPosting($posting); + $doc_id = + $this->makeItem( + $item, $start_offset, $next_offset, $last_offset); + $results[$doc_id] = $item; + $num_docs_so_far ++; + $old_next_offset = $next_offset; $next_offset += self::POSTING_LEN; - $doc_depth = log(10*(($doc_index +1) + - $this->generation_offset)*NUM_FETCHERS, 10); - $item[self::DOC_RANK] = number_format(11 - - $doc_depth, PRECISION); - $doc_loc = $doc_index << 4; - $doc_info_string = $this->getDocInfoSubstring($doc_loc, 12); - $doc_id = substr($doc_info_string, 0, 8); - $item[self::SUMMARY_OFFSET] = $this->unpackInt( - substr($doc_info_string, 8, 4)); - $is_doc = false; - $skip_stats = false; - - if($item[self::SUMMARY_OFFSET] == self::DUPLICATE_FLAG || - $item[self::SUMMARY_OFFSET] == self::NEEDS_OFFSET_FLAG) { - $skip_stats = true; - $item[self::DUPLICATE] = true; - } else if(($item[self::SUMMARY_OFFSET] - & self::COMPOSITE_ID_FLAG) !== 0) { - //handles link item case - $item[self::SUMMARY_OFFSET] ^= self::COMPOSITE_ID_FLAG; - $doc_loc += 12; - $doc_info_string = $this->getDocInfoSubstring($doc_loc, 16); - $doc_id .= ":". - substr($doc_info_string, 0, 8).":". - substr($doc_info_string, 8, 8); - $average_doc_len = ($this->num_link_docs != 0) ? 
- $this->len_all_link_docs/$this->num_link_docs : 0; - $num_docs = $this->num_link_docs; - } else { - $is_doc = true; - $average_doc_len = $this->len_all_docs/$this->num_docs; - $num_docs = $this->num_docs; - } - if(!$skip_stats) { - $tmp = unpack("N",$this->getDocInfoSubstring($doc_loc + 12, 4)); - $doc_len = $tmp[1]; - $doc_ratio = ($average_doc_len > 0) ? - $doc_len/$average_doc_len : 0; - $pre_relevance = number_format( - 3 * $occurrences/ - ($occurrences + .5 + 1.5* $doc_ratio), - PRECISION); - $num_term_occurrences = $num_doc_or_links * - $num_docs/($this->num_docs + $this->num_link_docs); - $IDF = ($num_docs - $num_term_occurrences + 0.5) / - ($num_term_occurrences + 0.5); - $item[self::RELEVANCE] = $IDF * $pre_relevance; - - $item[self::SCORE] = $item[self::DOC_RANK] + - .1/ ($item[self::RELEVANCE] + .1); - } - $results[$doc_id] = $item; - $num_docs_so_far ++; - } while ($next_offset<= $last_offset && $num_docs_so_far < $len && $next_offset > $old_next_offset); return $results; } + /** + * + */ + function makeItem(&$item, $start_offset, $current_offset, $last_offset, + $occurs = 0) + { + $num_doc_or_links = ($last_offset - $start_offset) >> 2; + + $posting = $this->getWordDocsSubstring($current_offset, 4); + list($doc_index, $occurrences) = $this->unpackPosting($posting); + if($occurrences < $occurs) { + $occurrences = $occurs; + } + $doc_depth = log(10*(($doc_index +1) + + $this->generation_offset)*NUM_FETCHERS, 10); + $item[self::DOC_RANK] = number_format(11 - + $doc_depth, PRECISION); + $doc_loc = $doc_index << 4; + $doc_info_string = $this->getDocInfoSubstring($doc_loc, 12); + $doc_id = substr($doc_info_string, 0, 8); + $item[self::SUMMARY_OFFSET] = $this->unpackInt( + substr($doc_info_string, 8, 4)); + $is_doc = false; + $skip_stats = false; + + if($item[self::SUMMARY_OFFSET] == self::DUPLICATE_FLAG || + $item[self::SUMMARY_OFFSET] == self::NEEDS_OFFSET_FLAG) { + $skip_stats = true; + $item[self::DUPLICATE] = true; + } else 
if(($item[self::SUMMARY_OFFSET] + & self::COMPOSITE_ID_FLAG) !== 0) { + //handles link item case + $item[self::SUMMARY_OFFSET] ^= self::COMPOSITE_ID_FLAG; + $doc_loc += 12; + $doc_info_string = $this->getDocInfoSubstring($doc_loc, 16); + $doc_id .= ":". + substr($doc_info_string, 0, 8).":". + substr($doc_info_string, 8, 8); + $average_doc_len = ($this->num_link_docs != 0) ? + $this->len_all_link_docs/$this->num_link_docs : 0; + $num_docs = $this->num_link_docs; + } else { + $is_doc = true; + $average_doc_len = $this->len_all_docs/$this->num_docs; + $num_docs = $this->num_docs; + } + + if(!$skip_stats) { + $doc_len = $this->unpackInt( + $this->getDocInfoSubstring($doc_loc + 12, 4)); + $doc_ratio = ($average_doc_len > 0) ? + $doc_len/$average_doc_len : 0; + $pre_relevance = number_format( + 3 * $occurrences/ + ($occurrences + .5 + 1.5* $doc_ratio), + PRECISION); + $num_term_occurrences = $num_doc_or_links * + $num_docs/($this->num_docs + $this->num_link_docs); + $IDF = ($num_docs - $num_term_occurrences + 0.5) / + ($num_term_occurrences + 0.5); + $item[self::RELEVANCE] = .05 * $IDF * $pre_relevance; + + $item[self::SCORE] = $item[self::DOC_RANK] + + + $item[self::RELEVANCE]; + } + return $doc_id; + + } + /** * */ @@ -495,7 +513,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants if(isset($this->words[$word_id])) { list($first_offset, $last_offset, $num_docs_or_links) = $this->getWordInfo($word_id, true); - $results = $this->getPostingsSlice($first_offset, $last_offset, $len); + $results = $this->getPostingsSlice($first_offset, + $first_offset, $last_offset, $len); } return $results; } diff --git a/models/model.php b/models/model.php index b95c49925..a0a72e7c8 100755 --- a/models/model.php +++ b/models/model.php @@ -107,7 +107,6 @@ class Model implements CrawlConstants { if(isset($results['PAGES'])) { $pages = $results['PAGES']; - $num_pages = count($pages); } else { $output['TOTAL_ROWS'] = 0; @@ -116,7 +115,9 @@ class Model implements 
CrawlConstants } for($i = 0; $i < $num_pages; $i++) { $page = $pages[$i]; - + if(!isset($page[self::TITLE])) { + $page[self::TITLE] = ""; + } $page[self::TITLE] = strip_tags($page[self::TITLE]); if(strlen($page[self::TITLE]) == 0 ) { @@ -134,7 +135,7 @@ class Model implements CrawlConstants substr(strip_tags($page[self::DESCRIPTION]), 0, $end_title). $ellipsis; //still no text revert to url - if(strlen($page[self::TITLE]) == 0) { + if(strlen($page[self::TITLE]) == 0 && isset($page[self::URL])) { $page[self::TITLE] = $page[self::URL]; } } diff --git a/models/phrase_model.php b/models/phrase_model.php index 5acd07ded..d404604c6 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -71,7 +71,11 @@ class PhraseModel extends Model */ var $index_name; - + /** + * Number of pages to cache in one go in memcache + * Size chosen based on 1MB max object size for memcache + */ + const NUM_CACHE_PAGES = 80; /** * {@inheritdoc} */ @@ -416,8 +420,10 @@ class PhraseModel extends Model $pages = array(); $generation = 0; - $to_retrieve = ceil(($limit+$num)/100) * 100; - $start_slice = floor(($limit)/100) * 100; + $to_retrieve = ceil(($limit+$num)/self::NUM_CACHE_PAGES) * + self::NUM_CACHE_PAGES; + $start_slice = floor(($limit)/self::NUM_CACHE_PAGES) * + self::NUM_CACHE_PAGES; if(USE_MEMCACHE) { $tmp = ""; foreach($word_structs as $word_struct) { @@ -448,7 +454,7 @@ class PhraseModel extends Model $pages = array_merge($pages, $gen_pages); $generation++; } - uasort($pages, "scoreOrderCallback"); + usort($pages, "scoreOrderCallback"); if($num_retrieved < $to_retrieve) { $results['TOTAL_ROWS'] = $num_retrieved; @@ -463,10 +469,9 @@ class PhraseModel extends Model $results['PAGES'] = & $pages; $results['PAGES'] = array_slice($results['PAGES'], $start_slice); if(USE_MEMCACHE) { - $MEMCACHE->set($summary_hash, $results); } - $results['PAGES'] = array_slice($results['PAGES'], $limit -$start_slice, + $results['PAGES'] = array_slice($results['PAGES'], $limit-$start_slice, 
$num); return $results; diff --git a/views/search_view.php b/views/search_view.php index 022da1865..3e15e5bea 100755 --- a/views/search_view.php +++ b/views/search_view.php @@ -116,7 +116,8 @@ class SearchView extends View implements CrawlConstants foreach($data['PAGES'] as $page) {?> <div class='result'> <h2> - <a href="<?php if($page[self::TYPE] != "link") { + <a href="<?php if(isset($page[self::TYPE]) + && $page[self::TYPE] != "link") { e($page[self::URL]); } else { e(strip_tags($page[self::TITLE])); @@ -126,23 +127,23 @@ class SearchView extends View implements CrawlConstants e($page[self::TITLE]); ?>" /> <?php } else { echo $page[self::TITLE]; - $this->filetypeHelper->render($page[self::TYPE]); + if(isset($page[self::TYPE])) { + $this->filetypeHelper->render($page[self::TYPE]); + } } ?></a></h2> <p><?php echo $page[self::DESCRIPTION]; ?></p> - <p class="echolink" ><?php - e(substr($page[self::URL],0, 200)." "); + <p class="echolink" ><?php if(isset($page[self::URL])){ + e(substr($page[self::URL],0, 200)." ");} e(tl('search_view_rank', number_format($page[self::DOC_RANK], 2))); $page["WEIGHT"] = (isset($page["WEIGHT"])) ? $page["WEIGHT"] : 1; e(tl('search_view_relevancy', - number_format((1.25*floatval($page[self::SCORE]) - - floatval($page[self::DOC_RANK])) - / $page["WEIGHT"] , 2) )); - e(tl('search_view_score', 1.25* $page[self::SCORE])); - if($page[self::TYPE] != "link") { + number_format($page[self::RELEVANCE], 2) )); + e(tl('search_view_score', $page[self::SCORE])); + if(isset($page[self::TYPE]) && $page[self::TYPE] != "link") { ?> <a href="?YIOOP_TOKEN=<?php e($data['YIOOP_TOKEN']); ?>&c=search&a=cache&q=<?php