diff --git a/INSTALL b/INSTALL index c48840026..ab3303363 100755 --- a/INSTALL +++ b/INSTALL @@ -52,10 +52,7 @@ php queue_server.php terminal from the bin folder. (3) You need at least one fetcher running -to download pages. You can set fetchers up either on the -same machine or on other machines. The QUEUE_SERVER -in config.php says the url of the server to get fetch batches -from. To run a fetcher, simply type: +to download pages. To run a fetcher, simply type: php fetcher.php terminal diff --git a/README b/README index 727bb3d1c..696efc756 100755 --- a/README +++ b/README @@ -58,7 +58,9 @@ Credits ------ Source code due to Chris Pollett. Several people helped with localization: Mary Pollett, Youn Kim, Sugi Widjaja, -Chao-Hsin Shih, Sujata Dongre, Jonathan Ben-David +Chao-Hsin Shih, Sujata Dongre, Jonathan Ben-David. +Thanks to Ravi Dhillon for finding and helping with the +fixes for Issue 15 and Commit 632e46. Installation ------------- diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php index 9f5462532..fdaa1fe6e 100644 --- a/lib/index_archive_bundle.php +++ b/lib/index_archive_bundle.php @@ -379,7 +379,6 @@ class IndexArchiveBundle implements CrawlConstants { $words_array = array(); if(!is_array($word_keys) || count($word_keys) < 1) { return NULL;} - foreach($word_keys as $word_key) { $tmp = $this->getCurrentShard()->getWordInfo($word_key); if($tmp === false) { diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php index e574d7276..93b49211c 100644 --- a/lib/index_bundle_iterators/group_iterator.php +++ b/lib/index_bundle_iterators/group_iterator.php @@ -102,6 +102,8 @@ class GroupIterator extends IndexBundleIterator { $this->index_bundle_iterator = $index_bundle_iterator; $this->num_docs = $this->index_bundle_iterator->num_docs; + $this->results_per_block = + $this->index_bundle_iterator->results_per_block; $this->reset(); } @@ -303,8 +305,10 @@ class GroupIterator extends
IndexBundleIterator /** * Forwards the iterator one group of docs + * @param $doc_offset if set the next block must all have $doc_offsets + * larger than or equal to this value */ - function advance() + function advance($doc_offset = null) { $this->advanceSeenDocs(); @@ -323,10 +327,21 @@ class GroupIterator extends IndexBundleIterator $this->grouped_keys[$hash_url] = true; } - $this->index_bundle_iterator->advance(); + $this->index_bundle_iterator->advance($doc_offset); } + /** + * Gets the doc_offset for the next document that would be returned by + * this iterator + * + * @return int the desired document offset + */ + function currentDocOffsetWithWord() { + return $this->index_bundle_iterator->currentDocOffsetWithWord(); + } + + /** * Returns the index associated with this iterator * @return &object the index diff --git a/lib/index_bundle_iterators/index_bundle_iterator.php b/lib/index_bundle_iterators/index_bundle_iterator.php index 5ab90e609..3878121d7 100644 --- a/lib/index_bundle_iterators/index_bundle_iterator.php +++ b/lib/index_bundle_iterators/index_bundle_iterator.php @@ -69,14 +69,24 @@ abstract class IndexBundleIterator implements CrawlConstants * @var array */ var $pages; - + /** * Says whether the value in $this->count_block is up to date * @var bool */ var $current_block_fresh; + /** + * Number of documents returned for each block (at most) + * @var int + */ + var $results_per_block = self::RESULTS_PER_BLOCK; + /** + * Default number of documents returned for each block (at most) + * @var int + */ + const RESULTS_PER_BLOCK = 100; /** * Returns the iterators to the first document block that it could iterate @@ -86,14 +96,25 @@ abstract class IndexBundleIterator implements CrawlConstants /** * Forwards the iterator one group of docs + * @param $doc_index if set the next block must all have $doc_indexes larger + * than or equal to this value */ - abstract function advance(); + abstract function advance($doc_index = null); + /** * Returns the index associated with this
iterator * @return object the index */ abstract function &getIndex($key = NULL); + /** + * Gets the doc_offset for the next document that would be returned by + * this iterator + * + * @return int the desired document offset + */ + abstract function currentDocOffsetWithWord(); + /** * Hook function used by currentDocsWithWord to return the current block * of docs if it is not cached @@ -164,11 +185,14 @@ abstract class IndexBundleIterator implements CrawlConstants /** * Get the current block of doc summaries for the word iterator and advances - * the current pointer to the next block + * the current pointer to the next block of documents. If a doc offset is + * given, the next block must be of docs at or after this doc_offset * + * @param $doc_offset if set the next block must all have $doc_offsets + * equal to or larger than this value * @return array doc summaries matching the $this->restrict_phrases */ - function nextDocsWithWord() + function nextDocsWithWord($doc_offset = null) { $doc_block = $this->getSummariesFromCurrentDocs(); @@ -176,7 +200,7 @@ abstract class IndexBundleIterator implements CrawlConstants return NULL; } - $this->advance(); + $this->advance($doc_offset); return $doc_block; @@ -198,5 +222,16 @@ abstract class IndexBundleIterator implements CrawlConstants $this->seen_docs += $this->count_block; } + /** + * Sets the value of the result_per_block field.
This field controls + * the maximum number of results that can be returned in one go by + * currentDocsWithWord() + * + * @param int $num the maximum number of results that can be returned by + * a block + */ + function setResultsPerBlock($num) { + $this->results_per_block = $num; + } } ?> diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php index d498461dd..62f36b691 100644 --- a/lib/index_bundle_iterators/intersect_iterator.php +++ b/lib/index_bundle_iterators/intersect_iterator.php @@ -66,12 +66,6 @@ class IntersectIterator extends IndexBundleIterator */ var $num_iterators; - /** - * The number of documents in the current block before filtering - * by restricted words - * @var int - */ - var $count_block_unfiltered; /** * The number of documents in the current block after filtering * by restricted words @@ -104,16 +98,20 @@ class IntersectIterator extends IndexBundleIterator $this->num_iterators = count($index_bundle_iterators); $this->num_docs = -1; + $this->results_per_block = 1; /* the most results we can return is the size of the least num_docs - of what we are itrerating over + of what we are iterating over. 
We are also setting up here + that we return at most one posting at a time from each + iterator */ for($i = 0; $i < $this->num_iterators; $i++) { if( $this->num_docs < 0 || $this->index_bundle_iterators[$i]->num_docs < $this->num_docs) { $this->num_docs = $this->index_bundle_iterators[$i]->num_docs; } + $this->index_bundle_iterators[$i]->setResultsPerBlock(1); } $this->reset(); } @@ -130,7 +128,6 @@ class IntersectIterator extends IndexBundleIterator $this->seen_docs = 0; $this->seen_docs_unfiltered = 0; - $doc_block = $this->currentDocsWithWord(); } @@ -143,134 +140,89 @@ class IntersectIterator extends IndexBundleIterator function findDocsWithWord() { $pages = array(); - $high_ranks = array(); - $last = $this->num_iterators - 1; - for($i = 0; $i < $this->num_iterators; $i++) { - $pages[$i] = - $this->index_bundle_iterators[$i]->currentDocsWithWord(); - if(!is_array($pages[$i]) || count($pages[$i]) == 0) { - $this->to_advance_index = $i; - return $pages[$i]; - } - list($low_ranks[$i], $high_ranks[$i]) = - $this->lowHighRanks($pages[$i], $i); - } - uasort($low_ranks, "docRankOrderCallback"); - - $low_ranks = array_values($low_ranks); - - $low_rank = $low_ranks[$last][self::DOC_RANK]; - - $this->to_advance_index = $low_ranks[0]["INDEX"]; - $this->count_block_unfiltered = count($pages[$this->to_advance_index]); - - $docs = array(); - $looping = true; - - while ($looping == true) { - for($i = 0; $i <= $last; $i++) { - list( ,$high_ranks[$i]) = - $this->lowHighRanks($pages[$i], $i, false); - } - $broke = false; - $score = 0; - $high_rank = $high_ranks[0][self::DOC_RANK]; - $high_key = $high_ranks[0]["KEY"]; - $high_index = $high_ranks[0]["INDEX"]; - $to_deletes = array(); - for($i = 1; $i <= $last; $i++) { - if($high_ranks[$i][self::DOC_RANK] < $low_rank ) { - $looping = false; - break 2; - } - if($high_ranks[$i][self::DOC_RANK] > $high_rank || - ($high_ranks[$i][self::DOC_RANK] == $high_rank && - strcmp($high_ranks[$i]["KEY"], $high_key) > 0) - ) { - $broke = true; - 
$high_rank = $high_ranks[$i][self::DOC_RANK]; - $high_index = $high_ranks[$i]["INDEX"]; - $high_key = $high_ranks[$i]["KEY"]; - $to_deletes[$high_index] = $high_key; - } - $score += $high_ranks[$i][self::SCORE]; - } - if($broke == false) { - $docs[$high_key] = $pages[$high_index][$high_key]; - $docs[$high_key][self::SCORE] = $score; - $to_deletes[$high_index] = $high_key; - } - - foreach($to_deletes as $index => $key) { - unset($pages[$index][$key]); - if(count($pages[$index]) == 0) { - $looping = false; - } - } + $status = $this->syncDocOffsetsAmongstIterators(); + if($status == -1) { + return -1; } + $docs = $this->index_bundle_iterators[0]->currentDocsWithWord(); $this->count_block = count($docs); $this->pages = $docs; return $docs; } /** - * Given a collection of documents, returns info about the low and high - * ranking documents. Namely, their ranks, keys, - * index in word iterator array, and scores * - * @param array &$docs documents to get low high info from - * @param int $index which word iterator these docs came from - * @param boo $sort_flag whether to sort the docs (if true) or to assume - * the docs are already sorted by rank - * @return array desired info */ - function lowHighRanks(&$docs, $index, $sort_flag = true) + function syncDocOffsetsAmongstIterators() { - if($sort_flag == true) { - uasort($docs, "docRankOrderCallback"); - } - reset($docs); - $high = array(); - $high["KEY"] = key($docs); - $high[self::DOC_RANK] = $docs[$high["KEY"]][self::DOC_RANK]; - $high[self::SCORE] = $docs[$high["KEY"]][self::SCORE]; - $high["INDEX"] = $index; - end($docs); - $low = array(); - $low["KEY"] = key($docs); - $low[self::DOC_RANK] = $docs[$low["KEY"]][self::DOC_RANK]; - $low[self::SCORE] = $docs[$low["KEY"]][self::SCORE]; - $low["INDEX"] = $index; - return array($low, $high); + do { // repeat until every iterator sits on the same doc offset + $all_same = true; // has to be recomputed on each pass + $biggest_offset = -1; + $offsets = array(); + for($i = 0; $i < $this->num_iterators; $i++) { + $offsets[$i] = +
$this->index_bundle_iterators[$i]->currentDocOffsetWithWord(); + if($offsets[$i] == -1) { // an exhausted iterator empties the intersection + return -1; + } + if($i > 0 && $offsets[$i] != $biggest_offset) { + $all_same = false; // smaller offsets are mismatches too + } + $biggest_offset = max($biggest_offset, $offsets[$i]); + } + if($all_same) { + return 1; + } + for($i = 0; $i < $this->num_iterators; $i++) { + if($offsets[$i] < $biggest_offset) { // only move iterators that lag + $this->index_bundle_iterators[$i]->advance($biggest_offset); + } + } + } while(true); } /** * Forwards the iterator one group of docs + * @param $doc_offset if set the next block must all have $doc_offsets + * larger than or equal to this value */ - function advance() + function advance($doc_offset = null) { $this->advanceSeenDocs(); - $this->seen_docs_unfiltered += $this->count_block_unfiltered; + $this->seen_docs_unfiltered = 0; - $min_num_docs = 10000000000; + //num_docs can change when advance() called so that's why we recompute + $total_num_docs = 0; for($i = 0; $i < $this->num_iterators; $i++) { - if($this->index_bundle_iterators[$i]->num_docs < $min_num_docs) { - $min_num_docs = $this->index_bundle_iterators[$i]->num_docs; - } + $this->seen_docs_unfiltered += + $this->index_bundle_iterators[$i]->seen_docs; + $total_num_docs += $this->index_bundle_iterators[$i]->num_docs; } if($this->seen_docs_unfiltered > 0) { $this->num_docs = - floor(($this->seen_docs * $min_num_docs) / + floor(($this->seen_docs * $total_num_docs) / $this->seen_docs_unfiltered); } else { $this->num_docs = 0; } - $this->index_bundle_iterators[$this->to_advance_index]->advance(); + + $this->index_bundle_iterators[0]->advance($doc_offset); } + /** + * Gets the doc_offset for the next document that would be returned by + * this iterator + * + * @return int the desired document offset + */ + function currentDocOffsetWithWord() { + if($this->syncDocOffsetsAmongstIterators() == -1) { return -1; } + return $this->index_bundle_iterators[0]->currentDocOffsetWithWord(); + } + /** * Returns the index associated with this iterator * @return object the index @@ -279,5 +231,23 @@ class IntersectIterator extends IndexBundleIterator { return
$this->index_bundle_iterators[0]->getIndex($key = NULL); } + + /** + * This method is supposed to set + * the value of the result_per_block field. This field controls + * the maximum number of results that can be returned in one go by + * currentDocsWithWord(). This method cannot be consistently + * implemented for this iterator and expect it to behave nicely + * it this iterator is used together with union_iterator. So + * to prevent a user for doing this, calling this method results + * in a user defined error + * + * @param int $num the maximum number of results that can be returned by + * a block + */ + function setResultsPerBlock($num) { + trigger_error("Cannot set the results per block of + an intersect iterator", E_USER_ERROR); + } } ?> diff --git a/lib/index_bundle_iterators/phrase_filter_iterator.php b/lib/index_bundle_iterators/phrase_filter_iterator.php index fc7fa5fc6..36e492812 100644 --- a/lib/index_bundle_iterators/phrase_filter_iterator.php +++ b/lib/index_bundle_iterators/phrase_filter_iterator.php @@ -122,6 +122,8 @@ class PhraseFilterIterator extends IndexBundleIterator $this->restrict_phrases = $restrict_phrases; $this->disallow_phrases = $disallow_phrases; $this->num_docs = $this->index_bundle_iterator->num_docs; + $this->results_per_block = + $this->index_bundle_iterator->results_per_block; $this->weight = $weight; $this->current_block_fresh = false; $this->reset(); @@ -249,8 +251,11 @@ class PhraseFilterIterator extends IndexBundleIterator /** * Forwards the iterator one group of docs + * @param $doc_offset if set the next block must all have $doc_offsets + * larger than or equal to this value */ - function advance() + function advance($doc_offset = null) + { $this->advanceSeenDocs(); @@ -267,7 +272,17 @@ class PhraseFilterIterator extends IndexBundleIterator $this->num_docs = 0; } - $this->index_bundle_iterator->advance(); + $this->index_bundle_iterator->advance($doc_offset); + } + + /** + * Gets the doc_offset for the next document that 
would be returned by + * this iterator + * + * @return int the desired document offset + */ + function currentDocOffsetWithWord() { + return $this->index_bundle_iterator->currentDocOffsetWithWord(); } /** diff --git a/lib/index_bundle_iterators/union_iterator.php b/lib/index_bundle_iterators/union_iterator.php index 3e5916f5e..a5cb0b6bd 100644 --- a/lib/index_bundle_iterators/union_iterator.php +++ b/lib/index_bundle_iterators/union_iterator.php @@ -102,8 +102,16 @@ class UnionIterator extends IndexBundleIterator */ $this->num_iterators = count($index_bundle_iterators); $this->num_docs = 0; + $this->results_per_block = 0; for($i = 0; $i < $this->num_iterators; $i++) { $this->num_docs += $this->index_bundle_iterators[$i]->num_docs; + /* + result_per_block is at most the sum of + results_per_block of things we are iterating. Value + is already init'd in base class. + */ + $this->results_per_block += + $this->index_bundle_iterators[$i]->results_per_block; } $this->reset(); } @@ -195,8 +203,10 @@ class UnionIterator extends IndexBundleIterator /** * Forwards the iterator one group of docs + * @param $doc_offset if set the next block must all have $doc_offsets + * larger than or equal to this value */ - function advance() + function advance($doc_offset = null) { $this->advanceSeenDocs(); @@ -205,7 +215,7 @@ class UnionIterator extends IndexBundleIterator $total_num_docs = 0; for($i = 0; $i < $this->num_iterators; $i++) { $total_num_docs += $this->index_bundle_iterators[$i]->num_docs; - $this->index_bundle_iterators[$i]->advance(); + $this->index_bundle_iterators[$i]->advance($doc_offset); } if($this->seen_docs_unfiltered > 0) { $this->num_docs = @@ -238,5 +248,38 @@ class UnionIterator extends IndexBundleIterator return $this->index_bundle_iterators[0]->getIndex($key); } } + + /** + * This method is supposed to set + * the value of the result_per_block field. This field controls + * the maximum number of results that can be returned in one go by + * currentDocsWithWord().
This method cannot be consistently + * implemented for this iterator and expect it to behave nicely + * it this iterator is used together with intersect_iterator. So + * to prevent a user for doing this, calling this method results + * in a user defined error + * + * @param int $num the maximum number of results that can be returned by + * a block + */ + function setResultsPerBlock($num) { + trigger_error("Cannot set the results per block of + a union iterator", E_USER_ERROR); + } + + /** + * This method is supposed to + * get the doc_offset for the next document that would be return by + * this iterator. As the union iterator as written returns a block + * of size at least the number of iterators in it, and this iterator + * is intended to be used when results_per_block is 1, we generate + * a user defined error. + * + * @return int the desired document offset + */ + function currentDocOffsetWithWord() { + trigger_error("Cannot get the doc offset with word of + a union iterator", E_USER_ERROR); + } } ?> diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php index 9a00f8cfb..42db86f03 100644 --- a/lib/index_bundle_iterators/word_iterator.php +++ b/lib/index_bundle_iterators/word_iterator.php @@ -96,6 +96,12 @@ class WordIterator extends IndexBundleIterator */ var $current_offset; + /** + * Starting Offset of word occurence in the IndexShard + * @var int + */ + var $start_offset; + /** * Last Offset of word occurence in the IndexShard * @var int @@ -109,11 +115,8 @@ class WordIterator extends IndexBundleIterator */ var $empty; - /** - * Number of documents returned for each block (at most) - * @var int - */ - const RESULTS_PER_BLOCK = 100; + + /** * Creates a word iterator with the given parameters. 
@@ -127,15 +130,16 @@ class WordIterator extends IndexBundleIterator function __construct($word_key, $index, $raw = false) { $this->word_key = $word_key; - $this->index = $index; $this->current_block_fresh = false; + $tmp = $index->getCurrentShard()->getWordInfo($word_key, $raw); if ($tmp === false) { $this->empty = true; } else { - list($this->current_offset, $this->last_offset, $this->num_docs) + list($this->start_offset, $this->last_offset, $this->num_docs) = $tmp; + $this->current_offset = $this->start_offset; $this->empty = false; $this->reset(); @@ -167,27 +171,51 @@ class WordIterator extends IndexBundleIterator return -1; } $this->next_offset = $this->current_offset; - $results = $this->index->getCurrentShard()->getWordSlice( - $this->next_offset, $this->last_offset, self::RESULTS_PER_BLOCK); + $results = $this->index->getCurrentShard()->getPostingsSlice( + $this->next_offset, $this->last_offset, $this->results_per_block); return $results; } /** * Forwards the iterator one group of docs + * @param $doc_offset if set the next block must all have $doc_offsets + * larger than or equal to this value */ - function advance() + function advance($doc_offset = null) { $this->advanceSeenDocs(); if($this->current_offset < $this->next_offset) { $this->current_offset = $this->next_offset; + if($doc_offset !== null) { + $found = $this->index->getCurrentShard( + )->nextPostingOffsetDocOffset($this->next_offset, + $this->last_offset, $doc_offset); + $this->current_offset = ($found === false) ? // no such posting: + $this->last_offset + 1 : $found; // mark iterator as exhausted + $this->seen_docs = ($this->current_offset - + $this->start_offset) / IndexShard::POSTING_LEN; + } } else { $this->current_offset = $this->last_offset + 1; } - + } + + /** + * Gets the doc_offset for the next document that would be returned by + * this iterator + * + * @return int the desired document offset + */ + function currentDocOffsetWithWord() { + if($this->current_offset > $this->last_offset) { + return -1; + } + return $this->index->getCurrentShard( + )->docOffsetFromPostingOffset($this->current_offset);
} - + /** * Returns the index associated with this iterator * @return &object the index diff --git a/lib/index_shard.php b/lib/index_shard.php index 40122dfa5..188fb45a1 100644 --- a/lib/index_shard.php +++ b/lib/index_shard.php @@ -96,39 +96,27 @@ class IndexShard extends PersistentStructure implements CrawlConstants * * @var array */ - var $firsts; - - /** - * - * @var int - */ - var $firsts_len; - - /** - * - * @var array - */ - var $seconds; + var $words; /** + * Stores length of the words array in the shard on disk. Only set if + * we're in $read_only_from_disk mode * * @var int */ - var $seconds_len; + var $words_len; /** * * @var array */ - var $words; + var $prefixes; /** - * Stores length of the words array in the shard on disk. Only set if - * we're in $read_only_from_disk mode * * @var int */ - var $words_len; + var $prefixes_len; /** * This is supposed to hold the number of documents that have been stored @@ -198,17 +186,17 @@ class IndexShard extends PersistentStructure implements CrawlConstants /** * Header Length of an IndexShard (sum of its non-variable length fields) */ - const HEADER_LENGTH = 40; + const HEADER_LENGTH = 36; /** * Length of a Word entry in bytes in the shard */ - const WORD_ITEM_LEN = 14; + const WORD_ITEM_LEN = 16; /** - * Length of a doc offset occurrence pair in a posting list + * Length of one posting ( a doc offset occurrence pair) in a posting list */ - const DOC_OCCURRENCES_LEN = 4; + const POSTING_LEN = 4; /** * Makes an index shard with the given file name and generation offset @@ -227,10 +215,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants parent::__construct($fname, -1); $this->generation_offset = $generation_offset; $this->word_docs = ""; - $this->firsts_len = 0; - $this->firsts = array(); - $this->seconds_len = 0; - $this->seconds = array(); $this->words_len = 0; $this->word_docs_len = 0; $this->words = array(); @@ -290,16 +274,13 @@ class IndexShard extends PersistentStructure implements 
CrawlConstants } foreach($word_counts as $word => $occurrences) { $word_id = crawlHash($word, true); - $first = $word_id[0]; - $second = $word_id[1]; - $rest_id = substr($word_id, 2); $occurrences = ($occurrences > 255 ) ? 255 : $occurrences & 255; - $store = pack("N", ($this->docids_len << 4) + $occurrences); - if(!isset($this->words[$first][$second][$rest_id])) { - $this->words[$first][$second][$rest_id] = $store; - } else if($this->words[$first][$second][$rest_id] != + $store = $this->packPosting($this->docids_len >> 4, $occurrences); + if(!isset($this->words[$word_id])) { + $this->words[$word_id] = $store; + } else if($this->words[$word_id] != pack("N", self::DUPLICATE_FLAG)) { - $this->words[$first][$second][$rest_id] .= $store; + $this->words[$word_id] .= $store; } if($occurrences > 0) { if($is_doc == true) { @@ -308,7 +289,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants $link_doc_len += $occurrences; } } - $this->word_docs_len += self::DOC_OCCURRENCES_LEN; + $this->word_docs_len += self::POSTING_LEN; } $this->len_all_docs += $doc_len; @@ -333,7 +314,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants */ function getWordInfo($word_id, $raw = false) { - if($raw == false) { //get rid of out modfied base64 encoding $hash = str_replace("_", "/", $word_id); @@ -359,14 +339,14 @@ class IndexShard extends PersistentStructure implements CrawlConstants * the list stops if an offset larger than $last_offset is seen or * $len many doc's have been returned. Since $next_offset is passed by * reference the value of $next_offset will point to the next record in - * the list (if it exists) after thhe function is called. + * the list (if it exists) after the function is called. 
* * @param int &$next_offset where to start in word docs * @param int $last_offset offset at which to stop by * @param int $len number of documents desired * @return array desired list of doc's and their info */ - function getWordSlice(&$next_offset, $last_offset, $len) + function getPostingsSlice(&$next_offset, $last_offset, $len) { if(!$this->read_only_from_disk && !$this->word_docs_packed) { $this->packWordDocs(); @@ -379,11 +359,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants do { if($next_offset >= $this->word_docs_len) {break;} $item = array(); - $doc_string = $this->getWordDocsSubstring($next_offset, 4); - $tmp = unpack("N", $doc_string); - $doc_int = $tmp[1]; - $occurrences = $doc_int & 255; - $doc_index = ($doc_int >> 8); + $posting = $this->getWordDocsSubstring($next_offset, 4); + list($doc_index, $occurrences) = $this->unpackPosting($posting); $old_next_offset = $next_offset; $next_offset += 4; $doc_depth = log(10*(($doc_index +1) + @@ -444,26 +421,78 @@ class IndexShard extends PersistentStructure implements CrawlConstants return $results; } + /** + * Gallop then binary search between $start_offset and $end_offset for the first posting whose doc offset is at or after $doc_offset; returns its byte offset, or false if there is none + */ + function nextPostingOffsetDocOffset($start_offset, $end_offset, + $doc_offset) { + + $doc_index = $doc_offset >> 4; + $current = floor($start_offset/self::POSTING_LEN); + $end = floor($end_offset/self::POSTING_LEN); + $low = $current; + $high = $end; + $stride = 1; + $gallop_phase = true; + + do { + $posting = $this->getWordDocsSubstring($current*self::POSTING_LEN, + self::POSTING_LEN); + list($post_doc_index, ) = $this->unpackPosting($posting); + if($doc_index == $post_doc_index) { + return $current * self::POSTING_LEN; + } else if($doc_index < $post_doc_index) { + if($low == $current) { + return $current * self::POSTING_LEN; + } else if($gallop_phase) { + $gallop_phase = false; + } + $high = $current; + $current = (($low + $high) >> 1); + } else { + $low = $current; + if($gallop_phase) { + $current = ($current < $end) ? min($current + $stride, $end) : $end + 1; // clamp the gallop to $end so the tail postings are still probed + $stride <<= 1; + } else { + if($current + 1 == $high) { +
$current++; + $low = $current; + } + $current = (($low + $high) >> 1); + } + } + + } while($current <= $end); + + return false; + } + + /** + * + */ + function docOffsetFromPostingOffset($offset) { + $posting = $this->getWordDocsSubstring($offset, self::POSTING_LEN); + list($doc_index, ) = $this->unpackPosting($posting); + return ($doc_index << 4); + } /** * Returns $len many documents which contained the word corresponding to - * $word_id + * $word_id (only wordk for loaded shards) * * @param string $word_id key to look up documents for * @param int number of documents desired back (from start of word linked * list). * @return array desired list of doc's and their info */ - function getWordSliceById($word_id, $len) + function getPostingsSliceById($word_id, $len) { $results = array(); - $first = $word_id[0]; - $second = $word_id[1]; - $rest_id = substr($word_id, 2); - if(isset($this->words[$first][$second][$rest_id])) { + if(isset($this->words[$word_id])) { list($first_offset, $last_offset, $num_docs_or_links) = $this->getWordInfo($word_id, true); - $results = $this->getWordSlice($first_offset, $last_offset, $len); + $results = $this->getPostingsSlice($first_offset, $last_offset, $len); } return $results; } @@ -476,7 +505,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants */ function appendIndexShard(&$index_shard) { - if($this->word_docs_packed == true) { $this->unpackWordDocs(); } @@ -485,37 +513,32 @@ class IndexShard extends PersistentStructure implements CrawlConstants } $this->doc_infos .= $index_shard->doc_infos; - foreach($index_shard->words as $first => $rest) { - foreach($rest as $second => $second_rest) { - foreach($second_rest as $rest_id => $postings) { + foreach($index_shard->words as $word_id => $postings) { $postings_len = strlen($postings); // update doc offsets for newly added docs - for($i = 0; $i < $postings_len; $i +=4) { - $doc_occurrences_string = substr($postings, $i, 4); - $tmp = unpack("N", 
$doc_occurrences_string); - $num = $tmp[1]; + for($i = 0; $i < $postings_len; $i += self::POSTING_LEN) { + $num = $this->unpackInt(substr($postings, $i, 4)); if($num != self::DUPLICATE_FLAG) { $num += ($this->docids_len << 4); - $doc_occurrences_string = pack("N", $num); - charCopy($doc_occurrences_string, $postings, $i, 4); + charCopy(pack("N", $num), $postings, $i, 4); } } $dup = pack("N", self::DUPLICATE_FLAG); - if(!isset($this->words[$first][$second][$rest_id])) { - $this->words[$first][$second][$rest_id] = $postings; + if(!isset($this->words[$word_id])) { + $this->words[$word_id] = $postings; $this->word_docs_len += $postings_len; - } else if($this->words[$first][$second][$rest_id] == $dup + } else if($this->words[$word_id] == $dup || $postings == $dup) { $old_word_docs_len = strlen( - $this->words[$first][$second][$rest_id]); - $this->words[$first][$second][$rest_id] = $dup; + $this->words[$word_id]); + $this->words[$word_id] = $dup; $this->word_docs_len -= $old_word_docs_len; $this->word_docs_len += strlen($dup); } else { - $this->words[$first][$second][$rest_id] .= $postings; + $this->words[$word_id] .= $postings; $this->word_docs_len += $postings_len; } - }}} + } $this->docids_len += $index_shard->docids_len; $this->num_docs += $index_shard->num_docs; @@ -575,11 +598,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants $doc_key = crawlHash($duplicate, true); $this->doc_infos .= $doc_key . pack("N", self::DUPLICATE_FLAG). 
pack("N", 0xFFFFFFFF); - $word_key = crawlHash("info:".$duplicate, true); - $first = $word_key[0]; - $second = $word_key[1]; - $rest_id = substr($word_key, 2); - $this->words[$first][$second][$rest_id] = + $word_id = crawlHash("info:".$duplicate, true); + $this->words[$word_id] = pack("N", $this->docids_len); $this->docids_len += 16; } @@ -591,9 +611,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants */ public function save() { - $this->computeFirstsSeconds(); - $header = pack("N", $this->firsts_len) . - pack("N", $this->seconds_len) . + $this->prepareWordsAndPrefixes(); + $header = pack("N", $this->prefixes_len) . pack("N", $this->words_len) . pack("N", $this->word_docs_len) . pack("N", $this->docids_len) . @@ -604,64 +623,65 @@ class IndexShard extends PersistentStructure implements CrawlConstants pack("N", $this->len_all_link_docs); $fh = fopen($this->filename, "wb"); fwrite($fh, $header); + fwrite($fh, $this->prefixes); $this->packWordDocs($fh); fwrite($fh, $this->word_docs); fwrite($fh, $this->doc_infos); fclose($fh); } - function computeFirstsSeconds() + /** + * + */ + function prepareWordsAndPrefixes() { - $this->firsts_len = 0; - $this->seconds_len = 0; - $this->words_len = 0; + $this->words_len = count($this->words) * IndexShard::WORD_ITEM_LEN; + ksort($this->words, SORT_STRING); + $blank = pack("N", 0xFFFFFFFF).pack("N", 0xFFFFFFFF); + $tmp = array(); + $offset = 0; + $num_words = 0; + $old_prefix = false; + $word_item_len = IndexShard::WORD_ITEM_LEN; foreach($this->words as $first => $rest) { - $this->firsts_len += 4; - $len = count($rest) << 2; - $this->firsts[$first] = $len; - foreach($rest as $second => $words) { - $third = count($this->words[$first][$second]) * - IndexShard::WORD_ITEM_LEN; - $this->seconds[$first][$second] = $third; - $this->words_len += $third; + $prefix = (ord($first[0]) << 8) + ord($first[1]); + if($old_prefix === $prefix) { + $num_words++; + } else { + if($old_prefix !== false) { + $tmp[$old_prefix] = 
pack("N", $offset) . + pack("N", $num_words); + $offset += $num_words * $word_item_len; + } + $old_prefix = $prefix; + $num_words = 1; + } + } + $tmp[$old_prefix] = pack("N", $offset) . pack("N", $num_words); + $num_prefixes = 2 << 16; + $this->prefixes = ""; + for($i = 0; $i < $num_prefixes; $i++) { + if(isset($tmp[$i])) { + $this->prefixes .= $tmp[$i]; + } else { + $this->prefixes .= $blank; } - $this->seconds_len += $len; } + $this->prefixes_len = strlen($this->prefixes); } function packWordDocs($fh = null) { - if($fh == null) { - $this->computeFirstsSeconds(); - } - $this->word_docs = ""; - $this->word_docs_len = 0; - if($fh != null) { - array_walk($this->firsts, function (&$value, $key, &$fh) { - $out = pack("N", (ord($key) << 24) + $value); - fwrite($fh, $out); - }, $fh); - - array_walk_recursive($this->seconds, function (&$value, $key, &$fh){ - $out = pack("N", (ord($key) << 24) + $value); - fwrite($fh, $out); - }, $fh); - } $this->word_docs_len = 0; $this->word_docs = ""; - foreach($this->words as $first => $seconds) { - foreach($seconds as $second => $rest) { - ksort($rest); // write out sorted, so can binary search on disk - foreach($rest as $rest_id => $postings) { - $len = strlen($postings); - $out = pack("N", $this->word_docs_len).pack("N", $len); - $this->word_docs .= $postings; - $this->word_docs_len += $len; - $this->words[$first][$second][$rest_id] = $out; - if($fh != null) { - fwrite($fh, $rest_id . $out); - } - } + foreach($this->words as $word_id => $postings) { + $len = strlen($postings); + $out = pack("N", $this->word_docs_len).pack("N", $len); + $this->word_docs .= $postings; + $this->word_docs_len += $len; + $this->words[$word_id] = $out; + if($fh != null) { + fwrite($fh, $word_id . 
$out); } } $this->word_docs_packed = true; @@ -674,20 +694,34 @@ class IndexShard extends PersistentStructure implements CrawlConstants */ function unpackWordDocs() { - foreach($this->words as $first => $seconds) { - foreach($seconds as $second => $rest) { - foreach($rest as $rest_id => $postings_info) { - $offset = $this->unpackInt(substr($postings_info, 0, 4)); - $len = $this->unpackInt(substr($postings_info, 4, 4)); - $postings = substr($this->word_docs, $offset, $len); - $this->words[$first][$second][$rest_id] = $postings; - } - } + foreach($this->words as $word_id => $postings_info) { + $offset = $this->unpackInt(substr($postings_info, 0, 4)); + $len = $this->unpackInt(substr($postings_info, 4, 4)); + $postings = substr($this->word_docs, $offset, $len); + $this->words[$word_id] = $postings; } unset($this->word_docs); $this->word_docs_packed = false; } + /** + * + */ + function packPosting($doc_index, $occurrences) + { + return pack("N", ($doc_index << 8) + $occurrences); + } + + /** + * + */ + function unpackPosting($posting) + { + $doc_int = $this->unpackInt($posting); + $occurrences = $doc_int & 255; + $doc_index = ($doc_int >> 8); + return array($doc_index, $occurrences); + } /** * Returns the first offset, last offset, and number of documents the @@ -704,57 +738,36 @@ class IndexShard extends PersistentStructure implements CrawlConstants { $this->getShardHeader(); $word_item_len = self::WORD_ITEM_LEN; - $first = $word_id[0]; - $second = $word_id[1]; - if(!isset($this->firsts) || $this->firsts == null || - count($this->firsts) == 0) { - /* if firsts not read in yet assume seconds not as well - seconds is about 256k, so hope memcache is active - */ - $firsts = $this->getShardSubstring(self::HEADER_LENGTH, - $this->firsts_len); - $seconds = $this->getShardSubstring(self::HEADER_LENGTH + - $this->firsts_len, - $this->seconds_len); - $this->unpackFirstSeconds($firsts, $seconds); - unset($firsts); - unset($seconds); + $prefix = (ord($word_id[0]) << 8) + 
ord($word_id[1]); + $prefix_info = $this->getShardSubstring( + self::HEADER_LENGTH + 8*$prefix, 8); + $blank = pack("N", 0xFFFFFFFF).pack("N", 0xFFFFFFFF); + if($prefix_info == $blank) { + return false; } + $offset = $this->unpackInt(substr($prefix_info, 0, 4)); - $start = self::HEADER_LENGTH + $this->firsts_len + - $this->seconds_len; - $high = 0; - foreach($this->seconds as $first_let => $seconds) { - foreach($seconds as $second_let => $third_len) { - if($first_let == $first && $second_let == $second) { - $high = floor($third_len/$word_item_len) - 1; - break 2; - } - $start += $third_len; - } - } + $high = $this->unpackInt(substr($prefix_info, 4, 4)) - 1; + $start = self::HEADER_LENGTH + $this->prefixes_len + $offset; $low = 0; - - $check_loc = ($low + $high >> 1); - + $check_loc = (($low + $high) >> 1); do { $old_check_loc = $check_loc; $word_string = $this->getShardSubstring($start + - $word_item_len +$check_loc * $word_item_len, - $word_item_len); + $check_loc * $word_item_len, $word_item_len); if($word_string == false) {return false;} - $word_string = $this->getShardSubstring($start - +$check_loc * $word_item_len, - $word_item_len); - $id = substr($word_string, 0, 6); - $cmp = strcmp($word_id, $first.$second.$id); + $id = substr($word_string, 0, 8); + $cmp = strcmp($word_id, $id); if($cmp === 0) { - return $this->getWordInfoFromString(substr($word_string, 6)); + return $this->getWordInfoFromString(substr($word_string, 8)); } else if ($cmp < 0) { $high = $check_loc; $check_loc = (($low + $check_loc) >> 1); } else { + if($check_loc + 1 == $high) { + $check_loc++; + } $low = $check_loc; $check_loc = (($high + $check_loc) >> 1); } @@ -775,18 +788,14 @@ class IndexShard extends PersistentStructure implements CrawlConstants */ function getWordInfoLoaded($word_id) { - $first = $word_id[0]; - $second = $word_id[1]; - $rest_id = substr($word_id, 2); - if(!isset($this->words[$first][$second][$rest_id])) { + if(!isset($this->words[$word_id])) { return false; } 
if(!$this->word_docs_packed){ $this->packWordDocs(); } - return $this->getWordInfoFromString( - $this->words[$first][$second][$rest_id]); + $this->words[$word_id]); } /** @@ -800,7 +809,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants { $first_offset = self::unpackInt(substr($str, 0, 4)); $len = self::unpackInt(substr($str, 4, 4)); - $last_offset = $first_offset + $len; + $last_offset = $first_offset + $len - self::POSTING_LEN; $count = $len >> 2; return array($first_offset, $last_offset, $count); @@ -819,7 +828,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants { if($this->read_only_from_disk) { $base_offset = self::HEADER_LENGTH + - $this->firsts_len + $this->seconds_len + $this->words_len; + $this->prefixes_len + $this->words_len; return $this->getShardSubstring($base_offset + $offset, $len); } return substr($this->word_docs, $offset, $len); @@ -837,8 +846,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants function getDocInfoSubstring($offset, $len) { if($this->read_only_from_disk) { - $base_offset = self::HEADER_LENGTH + $this->words_len - + $this->firsts_len + $this->seconds_len + $this->word_docs_len; + $base_offset = self::HEADER_LENGTH + $this->prefixes_len + + $this->words_len + $this->word_docs_len; return $this->getShardSubstring($base_offset + $offset, $len); } return substr($this->doc_infos, $offset, $len); @@ -912,26 +921,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants self::headerToShardFields($header, $this); } - /** - * - */ - function unpackFirstSeconds($firsts, $seconds) - { - $pre_firsts_array = str_split($firsts, 4); - array_walk($pre_firsts_array, 'IndexShard::makeFirsts', $this); - - $total_offset = 0; - foreach($this->firsts as $first => $seconds_len) { - for($offset=0; $offset < $seconds_len; $offset += 4) { - $pre_out = self::unpackInt( - substr($seconds,$total_offset +$offset,4)); - $second = chr(($pre_out >> 24)); - $third_len = 
0x00FFFFFF & $pre_out; - $this->seconds[$first][$second] = $third_len; - } - $total_offset += $seconds_len; - } - } /** * Load an IndexShard from a file @@ -945,30 +934,16 @@ class IndexShard extends PersistentStructure implements CrawlConstants $fh = fopen($fname, "rb"); $header = fread($fh, self::HEADER_LENGTH); self::headerToShardFields($header, $shard); - $firsts = fread($fh, $shard->firsts_len); - $seconds = fread($fh, $shard->seconds_len); + fread($fh, $shard->prefixes_len ); $words = fread($fh, $shard->words_len); $shard->word_docs = fread($fh, $shard->word_docs_len); $shard->doc_infos = fread($fh, $shard->docids_len); fclose($fh); - $shard->unpackFirstSeconds($firsts, $seconds); - unset($firsts); - unset($seconds); - $total_offset = 0; - foreach($shard->seconds as $first => $seconds_info) { - foreach($seconds_info as $second => $third_len) { - for($offset = 0; $offset < $third_len; - $offset += self::WORD_ITEM_LEN) { - $value = substr($words, - $total_offset + $offset, self::WORD_ITEM_LEN); - $rest_id = substr($value, 0, 6); - $info = substr($value, 6); - $shard->words[$first][$second][$rest_id] = $info; - } - $total_offset += $third_len; - } - } + + $pre_words_array = str_split($words, self::WORD_ITEM_LEN); unset($words); + array_walk($pre_words_array, 'IndexShard::makeWords', $shard); + return $shard; } @@ -983,16 +958,15 @@ class IndexShard extends PersistentStructure implements CrawlConstants { $header_array = str_split($header, 4); $header_data = array_map('IndexShard::unpackInt', $header_array); - $shard->firsts_len = $header_data[0]; - $shard->seconds_len = $header_data[1]; - $shard->words_len = $header_data[2]; - $shard->word_docs_len = $header_data[3]; - $shard->docids_len = $header_data[4]; - $shard->generation_offset = $header_data[5]; - $shard->num_docs = $header_data[6]; - $shard->num_link_docs = $header_data[7]; - $shard->len_all_docs = $header_data[8]; - $shard->len_all_link_docs = $header_data[9]; + $shard->prefixes_len = $header_data[0]; + 
$shard->words_len = $header_data[1]; + $shard->word_docs_len = $header_data[2]; + $shard->docids_len = $header_data[3]; + $shard->generation_offset = $header_data[4]; + $shard->num_docs = $header_data[5]; + $shard->num_link_docs = $header_data[6]; + $shard->len_all_docs = $header_data[7]; + $shard->len_all_link_docs = $header_data[8]; } /** @@ -1007,12 +981,18 @@ class IndexShard extends PersistentStructure implements CrawlConstants return $tmp[1]; } - static function makeFirsts(&$value, $key, &$shard) + + /** + * Callback function for load method. splits a word_key . word_info string + * into an entry in the passed shard $shard->words[word_key] = $word_info. + * + * @param string &value the word_key . word_info string + * @param int $key index in array - we don't use + * @param object $shard IndexShard to add the entry to word table for + */ + static function makeWords(&$value, $key, &$shard) { - $pre_out = self::unpackInt($value); - $first = chr($pre_out >> 24); - $seconds_len = (0x00FFFFFF & $pre_out); - $shard->firsts[$first] = $seconds_len; + $shard->words[substr($value, 0, 8)] = substr($value, 8, 8); } } diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php index 22d2d9008..b281028b1 100755 --- a/lib/phrase_parser.php +++ b/lib/phrase_parser.php @@ -133,7 +133,7 @@ class PhraseParser static function extractPhrasesOfLengthOffset($string, $phrase_len, $offset) { - $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\|"; + $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\||\&"; $words = mb_split("[[:space:]]|".$punct, $string); $stems = array(); diff --git a/lib/utility.php b/lib/utility.php index 9fbbf6918..c35cab166 100755 --- a/lib/utility.php +++ b/lib/utility.php @@ -200,10 +200,8 @@ function crawlCrypt($string, $salt = NULL) * precision * * @param string $start starting time with microseconds - * @param string $end ending time with microseconds + * @param string $end ending time with microseconds, if null use current time * @return float time 
difference in seconds - * @see SigninModel::changePassword() - * @see SigninModel::checkValidSignin() */ function changeInMicrotime( $start, $end=NULL ) { diff --git a/models/phrase_model.php b/models/phrase_model.php index 134775de1..5acd07ded 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -276,7 +276,7 @@ class PhraseModel extends Model $index_archive_name = self::index_data_base_name . $index_name; $index_archive = new IndexArchiveBundle( CRAWL_DIR.'/cache/'.$index_archive_name); - $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\|"; + $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\||\&"; $phrase_string = mb_ereg_replace($punct, " ", $phrase_string); $phrase_string = preg_replace("/(\s)+/", " ", $phrase_string); /* @@ -322,7 +322,7 @@ class PhraseModel extends Model $restrict_phrases = array_unique($restrict_phrases); $restrict_phrases = array_filter($restrict_phrases); $index_archive->setCurrentShard(0, true); - $words_array = $index_archive->getSelectiveWords($hashes, 10); + $words_array = $index_archive->getSelectiveWords($hashes, 5); if(is_array($words_array)) { reset($words_array); diff --git a/tests/index_shard_test.php b/tests/index_shard_test.php index 4bc2c6144..36cbe65b9 100644 --- a/tests/index_shard_test.php +++ b/tests/index_shard_test.php @@ -81,7 +81,7 @@ class IndexShardTest extends UnitTest /** * Check if can store documents into an index shard and retrieve them */ - public function addDocumentsGetWordSliceByIdTestCase() + public function addDocumentsGetPostingsSliceByIdTestCase() { $docid = "AAAAAAAA"; $offset = 5; @@ -101,7 +101,7 @@ class IndexShardTest extends UnitTest $this->assertEqual($this->test_objects['shard']->len_all_docs, 9, "Len All Docs Correctly Counts Length of First Doc"); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('CCCCCCCC', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Doc lookup by word 
works"); @@ -117,7 +117,7 @@ class IndexShardTest extends UnitTest ); $this->test_objects['shard']->addDocumentWords($docid, $offset, $word_counts, $meta_ids); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('CCCCCCCC', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Work lookup first item of two works"); @@ -127,7 +127,7 @@ class IndexShardTest extends UnitTest "Exactly two items were found in two item case"); //add a meta word lookup - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('EEEEEEEE', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Doc lookup by meta word works"); @@ -140,7 +140,7 @@ class IndexShardTest extends UnitTest /** * Check if can store link documents into an index shard and retrieve them */ - public function addLinkGetWordSliceByIdTestCase() + public function addLinkGetPostingsSliceByIdTestCase() { $docid = "AAAAAAAA:BBBBBBBB:CCCCCCCC"; //set up link doc $offset = 5; @@ -159,7 +159,7 @@ class IndexShardTest extends UnitTest $offset, $word_counts, $meta_ids); $this->assertEqual($this->test_objects['shard']->len_all_link_docs, 9, "Len All Docs Correctly Counts Length of First Doc"); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('MMMMMMMM', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA:BBBBBBBB:CCCCCCCC"]), "Link Doc lookup by word works"); @@ -178,7 +178,7 @@ class IndexShardTest extends UnitTest $this->test_objects['shard']->addDocumentWords($docid, $offset, $word_counts, $meta_ids); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('MMMMMMMM', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA:BBBBBBBB:CCCCCCCC"]), "Link Doc lookup by word works 1st of two"); @@ -233,15 +233,15 
@@ class IndexShardTest extends UnitTest $offset, $word_counts, $meta_ids); $this->test_objects['shard']->appendIndexShard( $this->test_objects['shard2']); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('BBBBBBBB', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Data from first shard present 1"); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('CCCCCCCC', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Data from first shard present 2"); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('DDDDDDDD', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Data from first shard present 3"); @@ -249,27 +249,27 @@ class IndexShardTest extends UnitTest "Data from second shard present 1"); $this->assertTrue(isset($c_data["GGGGGGGG"]), "Data from third shard present 1"); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('EEEEEEEE', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Data from first shard present 4"); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('FFFFFFFF', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Data from first shard present 5"); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('ZZZZZZZZ', true), 5); $this->assertTrue(isset($c_data["KKKKKKKK:GGGGGGGG:HHHHHHHH"]), "Data from second shard present 2"); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('IIIIIIII', true), 5); $this->assertTrue(isset($c_data["GGGGGGGG"]), "Data 
from third shard present 2"); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('JJJJJJJJ', true), 5); $this->assertTrue(isset($c_data["GGGGGGGG"]), "Data from third shard present 3"); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('KKKKKKKK', true), 5); $this->assertTrue(isset($c_data["GGGGGGGG"]), "Data from third shard present 4"); @@ -317,7 +317,7 @@ class IndexShardTest extends UnitTest ); $this->test_objects['shard']->addDocumentWords($docid, $offset, $word_counts, $meta_ids); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('BBBBBBBB', true), 5); $new_doc_offsets = array( "AAAAAAAA" => 5, @@ -326,7 +326,7 @@ class IndexShardTest extends UnitTest "DDDDDDDD" => 7, ); $this->test_objects['shard']->changeDocumentOffsets($new_doc_offsets); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('BBBBBBBB', true), 5); $predicted_offsets = array( "AAAAAAAA" => 5, @@ -359,7 +359,7 @@ class IndexShardTest extends UnitTest $doc_urls = array("http://somewhere.com/"); $this->test_objects['shard']->markDuplicateDocs($doc_urls); - $c_data = $this->test_objects['shard']->getWordSliceById( + $c_data = $this->test_objects['shard']->getPostingsSliceById( crawlHash('info:http://somewhere.com/', true), 5); $this->assertTrue(isset( $c_data[crawlHash($doc_urls[0], true)][CrawlConstants::DUPLICATE]), @@ -390,23 +390,23 @@ class IndexShardTest extends UnitTest $this->test_objects['shard2'] = IndexShard::load("shard.txt"); $this->assertEqual($this->test_objects['shard2']->len_all_docs, 9, "Len All Docs Correctly Counts Length of First Doc"); - $c_data = $this->test_objects['shard2']->getWordSliceById( + $c_data = 
$this->test_objects['shard2']->getPostingsSliceById( crawlHash('BBBBBBBB', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Doc lookup by word works"); - $c_data = $this->test_objects['shard2']->getWordSliceById( + $c_data = $this->test_objects['shard2']->getPostingsSliceById( crawlHash('CCCCCCCC', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Doc lookup 2 by word works"); - $c_data = $this->test_objects['shard2']->getWordSliceById( + $c_data = $this->test_objects['shard2']->getPostingsSliceById( crawlHash('DDDDDDDD', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Doc lookup 2 by word works"); - $c_data = $this->test_objects['shard2']->getWordSliceById( + $c_data = $this->test_objects['shard2']->getPostingsSliceById( crawlHash('EEEEEEEE', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Doc lookup 2 by word works"); - $c_data = $this->test_objects['shard2']->getWordSliceById( + $c_data = $this->test_objects['shard2']->getPostingsSliceById( crawlHash('FFFFFFFF', true), 5); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Doc lookup 2 by word works");