diff --git a/bin/fetcher.php b/bin/fetcher.php index 75a2c2624..275845fb8 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -72,6 +72,8 @@ require_once BASE_DIR."/lib/crawl_daemon.php"; require_once BASE_DIR."/lib/fetch_url.php"; /** Loads common constants for web crawling*/ require_once BASE_DIR."/lib/crawl_constants.php"; +/** Used to build the mini inverted index*/ +require_once BASE_DIR."/lib/index_shard.php"; /* @@ -1033,231 +1035,56 @@ class Fetcher implements CrawlConstants } /** - * Builds an inverted index (word --> {docs it appears in}) for the current - * batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages. This inverted - * is then merged by the queue_server into the inverted index of the - * current generation of the crawl. The complete inverted index for the - * whole crawl is built out of these inverted indexes for generations. - * The point of computing a partial inverted index on the fetcher is to - * reduce some of the computational burden on the queue server. The - * resulting mini index computed by buildMiniInvertedIndex() is stored in + * Builds an inverted index shard (word --> {docs it appears in}) + * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages. + * This inverted index shard is then merged by the queue_server + * into the inverted index of the current generation of the crawl. + * The complete inverted index for the whole crawl is built out of these + * inverted indexes for generations. The point of computing a partial + * inverted index on the fetcher is to reduce some of the computational + * burden on the queue server. 
The resulting mini index computed by + * buildMiniInvertedIndex() is stored in * $this->found_sites[self::INVERTED_INDEX] * */ function buildMiniInvertedIndex() { $start_time = microtime(); - $words = array(); - $doc_statistics = $this->computeDocumentStatistics(); - $average_title_length = $doc_statistics[self::AVERAGE_TITLE_LENGTH]; - $average_description_length = - $doc_statistics[self::AVERAGE_DESCRIPTION_LENGTH]; - $average_total_link_text_length = - $doc_statistics[self::AVERAGE_TOTAL_LINK_TEXT_LENGTH]; - - $special_case_fields = array(self::INLINKS, self::SITE_INFO, - self::FILETYPE, self::URL_INFO); - - foreach($doc_statistics as $doc_key => $info) { - if(in_array($doc_key, $special_case_fields)) {continue;} - $title_length = $info[self::TITLE_LENGTH]; - $description_length = $info[self::DESCRIPTION_LENGTH]; - $link_length = $info[self::LINK_LENGTH]; - - $title_ratio = ($average_title_length > 0) ? - $title_length/$average_title_length : 0; - $description_ratio = ($average_description_length > 0) ? - $description_length/$average_description_length :0; - $link_ratio = ($average_total_link_text_length > 0) ? 
- $link_length/$average_total_link_text_length : 0; - - if(isset($info[self::TITLE_WORDS])) { - foreach($info[self::TITLE_WORDS] - as $word_key => $num_occurrences) { - $title_frequency = $num_occurrences/$title_length; - - $words[crawlHash($word_key)][$doc_key][ - self::TITLE_WORD_SCORE] = - number_format(3 * $title_frequency/ - ($title_frequency + .5 + 1.5* $title_ratio), - PRECISION); - $words[crawlHash($word_key)][$doc_key][ - self::DESCRIPTION_WORD_SCORE] = 0; - // will set in a moment if has value - $words[crawlHash($word_key)][$doc_key][ - self::LINK_WORD_SCORE] = 0; - } - } - - if(isset($info[self::DESCRIPTION_WORDS])) { - foreach($info[self::DESCRIPTION_WORDS] - as $word_key => $num_occurrences) { - $description_frequency = - $num_occurrences/$description_length; - - $words[crawlHash($word_key)][$doc_key][ - self::DESCRIPTION_WORD_SCORE] = - number_format(3 * $description_frequency/ - ($description_frequency - + .5 + 1.5* $description_ratio), PRECISION); - - if(!isset($words[crawlHash($word_key)][$doc_key][ - self::TITLE_WORD_SCORE])) { - $words[crawlHash($word_key)][$doc_key][ - self::TITLE_WORD_SCORE] = 0; - } - - $words[crawlHash($word_key)][$doc_key][ - self::LINK_WORD_SCORE] = 0; - } - } - - if(isset($info[self::LINK_WORDS])) { - foreach($info[self::LINK_WORDS] - as $word_key => $num_occurrences) { - $link_frequency = $num_occurrences/$link_length; - - $words[crawlHash($word_key)][$doc_key][ - self::LINK_WORD_SCORE] = number_format( - 3 * $link_frequency/ - ($link_frequency + .5 + 1.5* $link_ratio), - PRECISION); - - if(!isset($words[crawlHash($word_key)][$doc_key][ - self::TITLE_WORD_SCORE])) { - $words[crawlHash($word_key)][$doc_key][ - self::TITLE_WORD_SCORE] = 0; - } - - if(!isset($words[crawlHash($word_key)][$doc_key][ - self::DESCRIPTION_WORD_SCORE])) { - $words[crawlHash($word_key)][$doc_key][ - self::DESCRIPTION_WORD_SCORE] = 0; - } - } - } - - } // end foreach - - foreach($words as $word_key => $docs_info) { - foreach($docs_info as $doc_key 
=> $info) { - $doc_depth = $doc_statistics[$doc_key][self::DOC_DEPTH]; - $doc_rank = (11 - $doc_depth) + - $doc_statistics[$doc_key][self::URL_WEIGHT]; - $words[$word_key][$doc_key][self::DOC_RANK] = - number_format($doc_rank, PRECISION); //proxy for page rank - - $orphan = (isset($info[self::LINK_WORDS]) && - $info[self::LINK_WORDS] == true) ? 1 : .5; - - $words[$word_key][$doc_key][self::SCORE] = number_format( - .8*($doc_rank) - + $info[self::TITLE_WORD_SCORE] - + 2*$info[self::DESCRIPTION_WORD_SCORE]*$orphan - + 1.5*$info[self::LINK_WORD_SCORE], PRECISION); - - } - } - - - //add word_keys for inlink, sites, filetype - foreach($special_case_fields as $special_case_field) { - if(isset($doc_statistics[$special_case_field])) { - foreach($doc_statistics[$special_case_field] - as $url_word_key => $docs_info) { - foreach($docs_info as $doc_key) { - $doc_depth = $doc_statistics[$doc_key][self::DOC_DEPTH] + 1; - $words[$url_word_key][$doc_key][self::TITLE_WORD_SCORE] = 0; - $words[$url_word_key][$doc_key][ - self::DESCRIPTION_WORD_SCORE] = 0; - $words[$url_word_key][$doc_key][self::LINK_WORD_SCORE] = 0; - $words[$url_word_key][$doc_key][self::DOC_RANK] = - number_format(11 - $doc_depth, PRECISION); - $words[$url_word_key][$doc_key][self::SCORE] = - number_format(11 - $doc_depth, PRECISION); - } - - } - } - } - foreach($this->found_duplicates as $duplicate) { - $doc_key = crawlHash($duplicate); - $url_word_key = crawlHash("info:".$duplicate); - $words[$url_word_key][$doc_key][self::TITLE_WORD_SCORE] = -1; - $words[$url_word_key][$doc_key][self::DESCRIPTION_WORD_SCORE] = -1; - $words[$url_word_key][$doc_key][self::LINK_WORD_SCORE] = -1; - $words[$url_word_key][$doc_key][self::DOC_RANK] = -1; - $words[$url_word_key][$doc_key][self::SCORE] = -1; - } - $this->found_duplicates = array(); - - $this->found_sites[self::INVERTED_INDEX] = $words; - - crawlLog(" Build mini inverted index time ". 
- (changeInMicrotime($start_time))); - } - - /** - * Used to compute number of words in each component (title, description, - * links) of a document separately as well as compute average amongst the - * current group of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many docs. - * - * @return array computed statistics - */ - function computeDocumentStatistics() - { - $doc_statistics = array(); $num_seen = count($this->found_sites[self::SEEN_URLS]); $this->num_seen_sites += $num_seen; + /* + for the fetcher we are not saving the index shards so + name doesn't matter. + */ + $index_shard = new IndexShard("fetcher_shard"); for($i = 0; $i < $num_seen; $i++) { $site = $this->found_sites[self::SEEN_URLS][$i]; - $doc_key = crawlHash($site[self::URL]); - - $doc_statistics[$doc_key][self::URL_WEIGHT] = - 3 - log(strlen($site[self::URL])); //negative except short urls - $doc_statistics[$doc_key][self::DOC_DEPTH] = - log($site[self::INDEX]*NUM_FETCHERS, 10); - $title_phrase_string = - mb_ereg_replace("[[:punct:]]", " ", $site[self::TITLE]); - $doc_statistics[$doc_key][self::TITLE_WORDS] = - PhraseParser::extractPhrasesAndCount($title_phrase_string); - $doc_statistics[$doc_key][self::TITLE_LENGTH] = - $this->sumCountArray( - $doc_statistics[$doc_key][self::TITLE_WORDS]); - $this->sum_seen_site_title_length += - $doc_statistics[$doc_key][self::TITLE_LENGTH]; - - $description_phrase_string = - mb_ereg_replace("[[:punct:]]", " ", $site[self::DESCRIPTION]); - $doc_statistics[$doc_key][self::DESCRIPTION_WORDS] = - PhraseParser::extractPhrasesAndCount( - $description_phrase_string); - $doc_statistics[$doc_key][self::DESCRIPTION_LENGTH] = - $this->sumCountArray( - $doc_statistics[$doc_key][self::DESCRIPTION_WORDS]); - $this->sum_seen_site_description_length += - $doc_statistics[$doc_key][self::DESCRIPTION_LENGTH]; - $doc_statistics[$doc_key][self::LINK_WORDS] = array(); - $doc_statistics[$doc_key][self::LINK_LENGTH] = 0; + $doc_key = crawlHash($site[self::URL], true); + $word_counts = array(); + 
$phrase_string = + mb_ereg_replace("[[:punct:]]", " ", $site[self::TITLE] . + " ". $site[self::DESCRIPTION]); + $word_counts = + PhraseParser::extractPhrasesAndCount($phrase_string); + + $meta_ids = array(); + // store the sites the doc_key belongs to, so you can search by site $url_sites = UrlParser::getHostPaths($site[self::URL]); $url_sites = array_merge($url_sites, UrlParser::getHostSubdomains($site[self::URL])); foreach($url_sites as $url_site) { if(strlen($url_site) > 0) { - $doc_statistics[self::SITE_INFO][ - crawlHash('site:'.$url_site)][] = $doc_key; + $meta_ids[] = 'site:'.$url_site; } } - $doc_statistics[self::URL_INFO][ - crawlHash('info:'.$site[self::URL])][] = $doc_key; + $meta_ids[] = 'info:'.$site[self::URL]; // store the filetype info $url_type = UrlParser::getDocumentType($site[self::URL]); if(strlen($url_type) > 0) { - $doc_statistics[self::FILETYPE][ - crawlHash('filetype:'.$url_type)][] = $doc_key; + $meta_ids[] = 'filetype:'.$url_type; } $link_phrase_string = ""; @@ -1271,6 +1098,7 @@ class Fetcher implements CrawlConstants } $had_links = false; + $link_shard = new IndexShard("link_shard"); foreach($site[self::LINKS] as $url => $link_text) { if(strlen($url) > 0) { $summary = array(); @@ -1278,8 +1106,9 @@ class Fetcher implements CrawlConstants $link_text = strip_tags($link_text); $link_id = "url|".$url."|text|$link_text|ref|".$site[self::URL]; - $link_key = crawlHash($link_id).":".crawlHash($url).":" - .crawlHash("info:".$url); + $link_key = crawlHash($link_id, true).":". 
+ crawlHash($url, true).":" + .crawlHash("info:".$url, "true"); $summary[self::URL] = $link_id; $summary[self::TITLE] = $url; // stripping html to be on the safe side @@ -1289,72 +1118,33 @@ class Fetcher implements CrawlConstants $summary[self::HASH] = crawlHash($link_id); $summary[self::TYPE] = "link"; $summary[self::HTTP_CODE] = "link"; - $summary[self::WEIGHT] = $link_weight; $this->found_sites[self::SEEN_URLS][] = $summary; - - $doc_statistics[$link_key][self::URL_WEIGHT] = - 3 - log(strlen($url)); - //negative except short urls - $doc_statistics[$link_key][self::TITLE_WORDS] =array(); - $doc_statistics[$link_key][self::TITLE_LENGTH] = 0; - $doc_statistics[$link_key][self::DESCRIPTION_WORDS] = - array(); - $doc_statistics[$link_key][self::DESCRIPTION_LENGTH] = 0; $link_text = mb_ereg_replace("[[:punct:]]", " ", $link_text); - $doc_statistics[$link_key][self::LINK_WORDS] = + $link_word_counts = PhraseParser::extractPhrasesAndCount($link_text); - $doc_statistics[$link_key][self::LINK_LENGTH] = - $this->sumCountArray( - $doc_statistics[$link_key][self::LINK_WORDS]); - $this->sum_seen_site_link_length += - $doc_statistics[$link_key][self::LINK_LENGTH]; - - $doc_statistics[$link_key][self::DOC_DEPTH] = - log(10*$site[self::INDEX]*NUM_FETCHERS, 10); - //our proxy for page rank, 10=average links/page - $doc_statistics[self::INLINKS][crawlHash('link:'.$url)][] = - $doc_key; + $link_shard->addDocumentWords($link_key, 0, + $link_word_counts, array()); + + $meta_ids[] = 'link:'.$url; } - $this->found_sites[self::SEEN_URLS][$i][self::LINKS] = - $had_links; - } + } + $index_shard->addDocumentWords($doc_key, 0, $word_counts, + $meta_ids); + + $index_shard->appendIndexShard($link_shard); } + $index_shard->markDuplicateDocs($this->found_duplicates); - $doc_statistics[self::AVERAGE_TITLE_LENGTH] = - $this->sum_seen_site_title_length/$this->num_seen_sites; - - $doc_statistics[self::AVERAGE_DESCRIPTION_LENGTH] = - $this->sum_seen_site_description_length/$this->num_seen_sites; - 
- $doc_statistics[self::AVERAGE_TOTAL_LINK_TEXT_LENGTH] = - $this->sum_seen_site_link_length/$this->num_seen_sites; - - crawlLog("AVERAGE TITLE LENGTH". - $doc_statistics[self::AVERAGE_TITLE_LENGTH]); - crawlLog("AVERAGE DESCRIPTION LENGTH". - $doc_statistics[self::AVERAGE_DESCRIPTION_LENGTH]); - crawlLog("AVERAGE TOTAL LINK TEXT LENGTH". - $doc_statistics[self::AVERAGE_TOTAL_LINK_TEXT_LENGTH]); - return $doc_statistics; - } + $this->found_duplicates = array(); - /** - * Computes a sum of the values of an associative array of key-value pairs - * - * @param array &$arr the associative array to compute the sum of - */ - function sumCountArray(&$arr) - { - $sum = 0; - foreach($arr as $key => $value) { - $sum += $value; - } + $this->found_sites[self::INVERTED_INDEX] = & $index_shard; - return $sum; + crawlLog(" Build mini inverted index time ". + (changeInMicrotime($start_time))); } } diff --git a/bin/queue_server.php b/bin/queue_server.php index 8677edf76..d190d1fdf 100755 --- a/bin/queue_server.php +++ b/bin/queue_server.php @@ -383,7 +383,7 @@ class QueueServer implements CrawlConstants CRAWL_DIR.'/cache/'. self::index_data_base_name.$this->crawl_time, URL_FILTER_SIZE, NUM_ARCHIVE_PARTITIONS, - NUM_INDEX_PARTITIONS, serialize($info)); + serialize($info)); } else { $dir = CRAWL_DIR.'/cache/'. 
self::index_data_base_name.$this->crawl_time; @@ -505,7 +505,6 @@ class QueueServer implements CrawlConstants $start_time = microtime(); - $index_archive = $this->index_archive; $fh = fopen($file, "rb"); $machine_string = fgets($fh); $len = strlen($machine_string); @@ -526,7 +525,8 @@ class QueueServer implements CrawlConstants if(isset($sites[self::SEEN_URLS]) && count($sites[self::SEEN_URLS]) > 0) { $seen_sites = $sites[self::SEEN_URLS]; - $index_archive->differenceContainsPages($seen_sites, self::HASH); + $this->index_archive->differenceContainsPages( + $seen_sites, self::HASH); $seen_sites = array_values($seen_sites); $num_seen = count($seen_sites); } else { @@ -535,18 +535,18 @@ class QueueServer implements CrawlConstants $visited_urls_count = 0; for($i = 0; $i < $num_seen; $i++) { - $index_archive->addPageFilter(self::HASH, $seen_sites[$i]); + $this->index_archive->addPageFilter(self::HASH, $seen_sites[$i]); $seen_sites[$i][self::MACHINE] = $machine; $seen_sites[$i][self::MACHINE_URI] = $machine_uri; $seen_sites[$i][self::HASH_URL] = - crawlHash($seen_sites[$i][self::URL]); + crawlHash($seen_sites[$i][self::URL], true); $link_url_parts = explode("|", $seen_sites[$i][self::URL]); if(strcmp("url", $link_url_parts[0]) == 0 && strcmp("text", $link_url_parts[2]) == 0) { $seen_sites[$i][self::HASH_URL] = - crawlHash($seen_sites[$i][self::URL]). - ":".crawlHash($link_url_parts[1]). - ":".crawlHash("info:".$link_url_parts[1]); + crawlHash($seen_sites[$i][self::URL], true). + ":".crawlHash($link_url_parts[1],true). 
+ ":".crawlHash("info:".$link_url_parts[1], true); } else { $visited_urls_count++; } @@ -554,7 +554,7 @@ class QueueServer implements CrawlConstants if(isset($seen_sites)) { $seen_sites = - $index_archive->addPages( + $this->index_archive->addPages( self::HASH_URL, self::SUMMARY_OFFSET, $seen_sites, $visited_urls_count); @@ -563,38 +563,25 @@ class QueueServer implements CrawlConstants $summary_offsets[$site[self::HASH_URL]] = $site[self::SUMMARY_OFFSET]; } - crawlLog("B memory usage".memory_get_usage() . + crawlLog("B (dedup + random) memory usage".memory_get_usage() . " time: ".(changeInMicrotime($start_time))); $start_time = microtime(); // added summary offset info to inverted index data - if(isset($sites[self::INVERTED_INDEX])) { - $index_data = & $sites[self::INVERTED_INDEX]; - foreach( $index_data as $word_key => $docs_info) { - foreach($docs_info as $doc_key => $info) { - if(isset($summary_offsets[$doc_key])) { - $index_data[$word_key][$doc_key][ - self::SUMMARY_OFFSET] = - $summary_offsets[$doc_key]; - } - } - } + if(isset($sites[self::INVERTED_INDEX])) { + $index_shard = & $sites[self::INVERTED_INDEX]; + $index_shard->changeDocumentOffsets($summary_offsets); } } - crawlLog("C memory usage".memory_get_usage() . - " time: ".(changeInMicrotime($start_time))); - $start_time = microtime(); - $index_archive->forceSave(); - crawlLog("D memory usage".memory_get_usage() . + crawlLog("C (update shard offsets) memory usage".memory_get_usage() . " time: ".(changeInMicrotime($start_time))); $start_time = microtime(); - if(isset($index_data)) { - $index_archive->addIndexData($index_data); + if(isset($index_shard)) { + $this->index_archive->addIndexData($index_shard); } - crawlLog("E memory usage".memory_get_usage(). + crawlLog("D (add index shard) memory usage".memory_get_usage(). 
" time: ".(changeInMicrotime($start_time))); - crawlLog("Done Processing File: $file"); unlink($file); diff --git a/configs/config.php b/configs/config.php index ac1b4768a..6a011ff87 100755 --- a/configs/config.php +++ b/configs/config.php @@ -152,14 +152,8 @@ define('MIN_QUEUE_WEIGHT', 1/100000); /** number of web archive files to use to store web pages in */ define('NUM_ARCHIVE_PARTITIONS', 10); -/** - * number of web archive files to use for the inverted index of - * word->docs in a given generation - */ -define('NUM_INDEX_PARTITIONS', 250); - -/** number of words before next gen */ -define('NUM_WORDS_PER_GENERATION', 6*URL_FILTER_SIZE/NUM_INDEX_PARTITIONS); +/** number of documents before next gen */ +define('NUM_DOCS_PER_GENERATION', 10000); /** number of generations to sample in estimating number of urls in a query */ define('SAMPLE_GENERATIONS', 3); @@ -167,15 +161,6 @@ define('SAMPLE_GENERATIONS', 3); /** precision to round floating points document scores */ define('PRECISION', 10); -/** - * when index data from relatively uncommon words, - * how many docs should be grouped together in a block - */ -define('BLOCK_SIZE', 50); - -/** how many documents a word needs to be to get its own index file. 
*/ -define('COMMON_WORD_THRESHOLD', 1000); - /** maximum number of links to consider on any given page */ define('MAX_LINKS_PER_PAGE', 50); @@ -263,7 +248,7 @@ $PAGE_PROCESSORS = array( "text/html" => "HtmlProcessor", * How many non robot urls the fetcher successfully downloads before * between times data sent back to queue server */ -define ('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', 400); +define ('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', 500); /** maximum number of urls to schedule to a given fetcher in one go */ define ('MAX_FETCH_SIZE', 5000); diff --git a/controllers/search_controller.php b/controllers/search_controller.php index 8b027f373..716ab82ee 100755 --- a/controllers/search_controller.php +++ b/controllers/search_controller.php @@ -231,7 +231,7 @@ class SearchController extends Controller implements CrawlConstants $this->phraseModel->lookupSummaryOffset($url); } $crawl_item = $this->crawlModel->getCrawlItem( - crawlHash($url), $summary_offset); + crawlHash($url, true), $summary_offset); $top_phrases = $this->phraseModel->getTopPhrases($crawl_item, 3); @@ -332,7 +332,7 @@ class SearchController extends Controller implements CrawlConstants $summary_offset = $this->phraseModel->lookupSummaryOffset($url); } - if(!$crawl_item = $this->crawlModel->getCrawlItem(crawlHash($url), + if(!$crawl_item = $this->crawlModel->getCrawlItem(crawlHash($url, true), $summary_offset)) { $this->displayView("nocache", $data); diff --git a/lib/bloom_filter_bundle.php b/lib/bloom_filter_bundle.php index 6962e7b34..4615adc85 100644 --- a/lib/bloom_filter_bundle.php +++ b/lib/bloom_filter_bundle.php @@ -102,7 +102,7 @@ class BloomFilterBundle $this->num_filters++; $this->filter_size = $filter_size; $this->current_filter->save(); - $this->saveMetaData(); + $this->saveMetaData(); } else { $last_filter = $this->num_filters - 1; $this->current_filter = @@ -132,12 +132,13 @@ class BloomFilterBundle $this->filter_size); $this->current_filter_count = 0; $this->num_filters++; + 
$this->saveMetaData(); } $this->current_filter->add($value); $this->current_filter_count++; - $this->saveMetaData(); + } /** diff --git a/lib/bloom_filter_file.php b/lib/bloom_filter_file.php index 77c494d87..dcc4c12ec 100755 --- a/lib/bloom_filter_file.php +++ b/lib/bloom_filter_file.php @@ -102,9 +102,9 @@ class BloomFilterFile extends PersistentStructure function add($value) { $num_keys = $this->num_keys; + $pos_array = $this->getHashBitPositionArray($value, $num_keys); for($i = 0; $i < $num_keys; $i++) { - $pos = $this->getHashBitPosition($value.$i); - $this->setBit($pos); + $this->setBit($pos_array[$i]); } $this->checkSave(); @@ -119,10 +119,9 @@ class BloomFilterFile extends PersistentStructure function contains($value) { $num_keys = $this->num_keys; + $pos_array = $this->getHashBitPositionArray($value, $num_keys); for($i = 0; $i < $num_keys; $i++) { - $pos = $this->getHashBitPosition($value.$i); - - if(!$this->getBit($pos)) { + if(!$this->getBit($pos_array[$i])) { return false; } } @@ -136,15 +135,31 @@ class BloomFilterFile extends PersistentStructure * @param string $value value to map to a bit position in the filter * @return int the bit position mapped to */ - function getHashBitPosition($value) + function getHashBitPositionArray($value, $num_keys) { - $hash = substr(md5($value, true), 0, 4); - $int_array = unpack("N", $hash); - $seed = $int_array[1]; + $md5 = md5($value, true); + $seed = array(); + for($i = 0; $i < 16; $i += 4) { + $hash = substr($md5, $i, 4); + $int_array = unpack("N", $hash); + $seed[] = $int_array[1]; + } - mt_srand($seed); - $pos = mt_rand(0, $this->filter_size -1); - return $pos; + //$pos_array = array_fill(0, $num_keys, 0); + $pos_array = array(); + $offset = $num_keys >> 2; + $size = $this->filter_size - 1; + $index = 0; + for($j = 0; $j < $num_keys; $j += $offset) { + $high = $j + $offset; + if($index < 4) { + mt_srand($seed[$index++]); + } + for($i = $j; $i < $high; $i++) { + $pos_array[$i] = mt_rand(0, $size); + } + } + return 
$pos_array; } /** @@ -154,7 +169,7 @@ class BloomFilterFile extends PersistentStructure */ function setBit($i) { - $byte = ($i >> 3);; + $byte = ($i >> 3); $bit_in_byte = $i - ($byte << 3); diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php index 12a98b732..b836591ea 100644 --- a/lib/crawl_constants.php +++ b/lib/crawl_constants.php @@ -131,5 +131,8 @@ interface CrawlConstants const HASH_SEEN_URLS ='aj'; const RECENT_URLS ='ak'; const MEMORY_USAGE ='al'; + const DOC_ID ='am'; + const RELEVANCE ='an'; + const DUPLICATE ='ao'; } ?> diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php index 6ce39b8da..2fa1bef34 100644 --- a/lib/index_archive_bundle.php +++ b/lib/index_archive_bundle.php @@ -38,9 +38,9 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} */ require_once 'web_archive_bundle.php'; /** - * Bloom Filter used by BloomFilterBundle + * Used to store word index */ -require_once 'bloom_filter_file.php'; +require_once 'index_shard.php'; /** * Used to check if a page already stored in the WebArchiveBundle */ @@ -60,46 +60,6 @@ require_once 'crawl_constants.php'; require_once 'indexing_constants.php'; -/** - * Callback function used to set the offsets into the archive file from - * the particular word info in the header block of a WordArchive - * - * @param array $data - * @param array $objects - * @param string $offset_field - */ -function setOffsetPointers($data, &$objects, $offset_field) -{ - $count = count($objects); - - for($i = 0 ; $i < $count ; $i++ ) { - if(isset($objects[$i][$offset_field]) ) { - $offset = $objects[$i][$offset_field]; - foreach($objects[$i] as $word_key_and_block_num => $docs_info) { - $tmp = explode(":", $word_key_and_block_num); - if(isset($tmp[1]) ) { - list($word_key, $block_num) = $tmp; - if(strcmp($word_key, "offset") != 0) { - if(($block_num +1)*BLOCK_SIZE < - COMMON_WORD_THRESHOLD) { - $data[$word_key][$block_num] = $offset; - } else if(isset( - $docs_info[IndexingConstants::POINT_BLOCK])) { - 
$data[$word_key][IndexingConstants::LIST_OFFSET] = - $offset; - } - } - } - } - - } - } - - return $data; -} - - - /** * Encapsulates a set of web page summaries and an inverted word-index of terms * from these summaries which allow one to search for summaries containing a @@ -108,34 +68,12 @@ function setOffsetPointers($data, &$objects, $offset_field) * The basic file structures for an IndexArchiveBundle are: * <ol> * <li>A WebArchiveBundle for web page summaries.</li> - * <li>A set of WebArchiveBundles for the inverted index. Each such bundle - * is called a <b>generation</b>. These bundles have name index0, index1,... - * The file generations.txt keeps track of what is the current generation - * and how many words have been stored in it. A given generation can - * hold NUM_WORDS_PER_GENERATION words amongst all its partitions. After which - * the next generation begins. In a given generation, a word is stored in - * the partition that its hash key hashes to. The same word may appear in - * several generations. The info block for a partition for a particular - * generation contains objects for each word of the generation that hashed - * to that partition. Each such word object contains a count of the number - * of documents it occurred in for that generation. It also has an - * array of block_pointers to blocks of size BLOCK_SIZE. These blocks contains - * documents that the word occurred in, the score for the occurrence, and - * an offset into the summary file for that document. If the total number of - * documents is not a multiple of BLOCK_SIZE the remaining documents are stored - * directly in the word's info block object. If, in a given generation, a - * word occurs more than COMMON_WORD_THRESHOLD many times then the word object - * uses a LIST_OFFSET pointer to point to a linked list in the partition of - * addtional blocks of documents for that word. + * <li>A set of inverted index generations. These generations + * have name index0, index1,... 
+ * The file generation.txt keeps track of which generation is current. + * A given generation can hold NUM_DOCS_PER_GENERATION documents; once that + * limit would be exceeded the next generation begins. * </li> - * <li>For each partition and for all generations a BloomFilterFile is used - * to keep track of which words appear in which generations for a - * particular partition. These filters are stored in a folder within the - * IndexArchiveBundle called index_filters. When a word and documents - * containing it are stored in an IndexArchiveBundle, its word_key (its has) is - * stored in the filter for the partition its word_key hash to. Further - * if the current generation is i, then work_ket concatenated with i is - * also stored in this same filter.</li> * </ol> * * @@ -145,12 +83,7 @@ function setOffsetPointers($data, &$objects, $offset_field) */ class IndexArchiveBundle implements IndexingConstants, CrawlConstants { - /** - * Used to keep track of the time to perform various operations - * in this IndexArchiveBundle - * @var array - */ - var $diagnostics; + /** * Folder name to use for this IndexArchiveBundle * @var string @@ -166,12 +99,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants * @int */ var $num_partitions_summaries; - /** - * Number of partitions in the inverted word index - * (same for each generation) - * @int - */ - var $num_partitions_index; + /** * structure contains info about the current generation: * its index (ACTIVE), and the number of words it contains @@ -180,26 +108,20 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants */ var $generation_info; /** - * Number of words before a new generation is started + * Number of docs before a new generation is started * @int */ - var $num_words_per_generation; + var $num_docs_per_generation; /** * WebArchiveBundle for web page summaries * @object */ var $summaries; /** - * WebArchiveBundle for inverted word index + * Index Shard for current 
generation inverted word index * @object */ - var $index; - /** - * Bloom Filters used to figure out which words are in which generations for - * given paritions - * @object - */ - var $index_partition_filters; + var $current_shard; /** * Makes or initializes an IndexArchiveBundle with the provided parameters @@ -216,8 +138,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants * IndexArchiveBundle */ public function __construct($dir_name, $filter_size = -1, - $num_partitions_summaries = NULL, $num_partitions_index = NULL, - $description = NULL) + $num_partitions_summaries = NULL, $description = NULL, + $num_docs_per_generation = NUM_DOCS_PER_GENERATION) { $this->dir_name = $dir_name; @@ -225,7 +147,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants if(!is_dir($this->dir_name)) { mkdir($this->dir_name); - mkdir($this->dir_name."/index_filters"); + } else { $index_archive_exists = true; @@ -236,7 +158,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants file_get_contents($this->dir_name."/generation.txt")); } else { $this->generation_info['ACTIVE'] = 0; - $this->generation_info['NUM_WORDS'] = 0; file_put_contents($this->dir_name."/generation.txt", serialize($this->generation_info)); } @@ -246,13 +167,9 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants $this->num_partitions_summaries = $this->summaries->num_partitions; - $this->index = new WebArchiveBundle( - $dir_name."/index".$this->generation_info['ACTIVE'], -1, - $num_partitions_index); - $this->num_partitions_index = $this->index->num_partitions; $this->description = $this->summaries->description; - $this->num_words_per_generation = NUM_WORDS_PER_GENERATION; + $this->num_docs_per_generation = $num_docs_per_generation; } @@ -283,228 +200,98 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants * @param array $index_data a mini inverted index of word_key=>doc data * to add to this IndexArchiveBundle */ - 
public function addIndexData($index_data) + public function addIndexData($index_shard) { - $out_data = array(); - - if(!count($index_data) > 0) return; - - /* Arrange the words according to the partitions they are in - */ - - $this->diagnostics['SELECT_TIME'] = 0; - $this->diagnostics['INFO_BLOCKS_TIME'] = 0; - $this->diagnostics['ADD_FILTER_TIME'] = 0; - $this->diagnostics['ADD_OBJECTS_TIME'] = 0; + crawlLog("**ADD INDEX DIAGNOSTIC INFO..."); $start_time = microtime(); - foreach($index_data as $word_key => $docs_info) { - - $partition = WebArchiveBundle::selectPartition( - $word_key, $this->num_partitions_index); - $out_data[$partition][$word_key] = $docs_info; - - } - $this->diagnostics['SELECT_TIME'] += changeInMicrotime($start_time); - - /* for each partition add the word data for the partition to the - partition web archive - */ - $cnt = 0; - foreach($out_data as $partition => $word_data) { - $this->addPartitionWordData($partition, $word_data); - $cnt++; + $current_num_docs = $this->getActiveShard()->num_docs; + $add_num_docs = $index_shard->num_docs; + if($current_num_docs + $add_num_docs > $this->num_docs_per_generation){ + $switch_time = microtime(); + $this->forceSave(); + $this->generation_info['ACTIVE']++; + $this->generation_info['CURRENT'] = + $this->generation_info['ACTIVE']; + $current_index_shard_file = $this->dir_name."/index". + $this->generation_info['ACTIVE']; + $this->current_shard = new IndexShard( + $current_index_shard_file, $this->generation_info['ACTIVE'] * + $this->num_docs_per_generation); + file_put_contents($this->dir_name."/generation.txt", + serialize($this->generation_info)); + crawlLog("Switch Shard time:".changeInMicrotime($switch_time)); } - file_put_contents($this->dir_name."/generation.txt", - serialize($this->generation_info)); - $out_data = NULL; - gc_collect_cycles(); - - crawlLog("**ADD INDEX DIAGNOSTIC INFO..."); - crawlLog("**Time calculating select partition functions ". 
- $this->diagnostics['SELECT_TIME']); - crawlLog("**Time reading info blocks ". - $this->diagnostics['INFO_BLOCKS_TIME']); - crawlLog("**Time adding objects to index ". - $this->diagnostics['ADD_OBJECTS_TIME']); - crawlLog("**Time adding to filters ". - $this->diagnostics['ADD_FILTER_TIME']); - crawlLog("**Number of partitions ".$cnt); - + $this->getActiveShard()->appendIndexShard($index_shard); + crawlLog("Append Index Shard: Memory usage:".memory_get_usage() . + " Time: ".(changeInMicrotime($start_time))); } /** - * Adds the mini-inverted index data that to a particular partition. - * It is assume the word keys in this data would hash to the destined - * index partitions - * - * @param int $partition WebArchive in the index WebArchiveBundle of the - * current generation to write to - * @param array &$word_data what to wrtie - * @param bool $overwrite whether to signal that all data in prior - * generations associated with keys that are being inserted should be - * ignored (for instance, multi-word search are partially computed and - * added to the index. If these get recomputed we might want to ignore - * prior work. ) + * Sets the current shard to be the active shard (the active shard is + * what we call the last (highest indexed) shard in the bundle. Then + * returns a reference to this shard + * @return &object last shard in the bundle */ - public function addPartitionWordData($partition, - &$word_data, $overwrite = false) - { - $start_time = microtime(); - - $block_data = $this->readPartitionInfoBlock($partition); - - if(isset($this->diagnostics['INFO_BLOCKS_TIME'])) { - $this->diagnostics['INFO_BLOCKS_TIME'] += - changeInMicrotime($start_time); + public function &getActiveShard() + { + if($this->setCurrentShard($this->generation_info['ACTIVE'])) { + return $this->getCurrentShard(); + } else if(!isset($this->current_shard) ) { + $current_index_shard_file = $this->dir_name."/index". 
+ $this->generation_info['CURRENT']; + $this->current_shard = new IndexShard($current_index_shard_file, + $this->generation_info['CURRENT']*$num_docs_per_generation); } - - if($block_data == NULL) { - $block_data[self::NAME] = $partition; - } - - //update counts set-up add link to offset linked lists - $out_data = array(); - $out_data[0] = array(); - - $this->initPartitionIndexFilter($partition); - - foreach($word_data as $word_key => $docs_info) { - $start_time = microtime(); + return $this->current_shard; + } - $this->addPartitionIndexFilter($partition, $word_key); - $this->addPartitionIndexFilter( - $partition, $word_key . $this->generation_info['ACTIVE']); - if(isset($this->diagnostics['ADD_FILTER_TIME'])) { - $this->diagnostics['ADD_FILTER_TIME'] += - changeInMicrotime($start_time); + /** + * Returns the shard which is currently being used to read word-document + * data from the bundle. If one wants to write data to the bundle use + * getActiveShard() instead. The point of this method is to allow + * for lazy reading of the file associated with the shard. + * + * @return &object the currently being index shard + */ + public function &getCurrentShard() + { + if(!isset($this->current_shard)) { + if(!isset($this->generation_info['CURRENT'])) { + $this->generation_info['CURRENT'] = + $this->generation_info['ACTIVE']; } - - if(!isset($block_data[$word_key]) || $overwrite == true) { - unset($block_data[$word_key]); - $block_data[$word_key][self::COUNT] = 0; - $block_data[$word_key][self::END_BLOCK] = array(); - $block_data[$word_key][self::LIST_OFFSET] = NULL; - $unfilled_block_num = 0; - + $current_index_shard_file = $this->dir_name."/index". 
+ $this->generation_info['CURRENT']; + + if(file_exists($current_index_shard_file) ) { + $this->current_shard = + IndexShard::load($current_index_shard_file); } else { - $unfilled_block_num = - floor($block_data[$word_key][self::COUNT] / BLOCK_SIZE); + $this->current_shard = new IndexShard($current_index_shard_file, + $this->generation_info['CURRENT']* + $this->num_docs_per_generation); } - - $cnt = count($docs_info); - $block_data[$word_key][self::COUNT] += $cnt; - - $tmp = - array_merge($block_data[$word_key][self::END_BLOCK],$docs_info); - uasort($tmp, "docRankOrderCallback"); - $add_cnt = count($tmp); - $num_blocks = floor($add_cnt / BLOCK_SIZE); - $block_data[$word_key][self::END_BLOCK] = - array_slice($tmp, $num_blocks*BLOCK_SIZE); - - $first_common_flag = true; - $min_common = NULL; - $slice_cnt = $num_blocks - 1; - for($i = $unfilled_block_num + $num_blocks - 1; - $i >= $unfilled_block_num ; $i--) { - $out_data[0][$word_key .":". $i] = - array_slice($tmp, $slice_cnt*BLOCK_SIZE, BLOCK_SIZE); - if(($i+1)*BLOCK_SIZE > COMMON_WORD_THRESHOLD) { - $min_common = $i; - if($first_common_flag) { - if(isset($block_data[$word_key][self::LIST_OFFSET])) { - $out_data[0][$word_key .":". $i][self::LIST_OFFSET]= - $block_data[$word_key][self::LIST_OFFSET]; - } else { - $out_data[0][$word_key .":". $i][self::LIST_OFFSET]= - NULL; - } - $first_common_flag = false; - } else { - $out_data[0][$word_key .":". $i][self::LIST_OFFSET] = - NULL; // next in list is in same block - } - } - - $slice_cnt--; - } - if($min_common !== NULL) { - $out_data[ - 0][$word_key .":". 
$min_common][self::POINT_BLOCK] = 0; - // this index needs to point to previous block with word - } - } - - $start_time = microtime(); - $this->index->addObjectsPartition("offset", $partition, - $out_data, $block_data, "setOffsetPointers", false); - - if(isset($this->diagnostics['ADD_OBJECTS_TIME'])) { - $this->diagnostics['ADD_OBJECTS_TIME'] += - changeInMicrotime($start_time); - } - - - if($this->generation_info['NUM_WORDS']>$this->num_words_per_generation){ - $index_filter_size = $this->index->filter_size; - $this->generation_info['ACTIVE']++; - $this->generation_info['NUM_WORDS'] = 0; - $this->index = new WebArchiveBundle( - $this->dir_name."/index".$this->generation_info['ACTIVE'], - $index_filter_size, $this->num_partitions_index); - file_put_contents( - $this->dir_name."/generation.txt", - serialize($this->generation_info)); - } - - } + return $this->current_shard; + } /** - * Adds the provided $word_key to the BloomFilter for the given partition + * Sets the current shard to be the $i th shard in the index bundle. 
* - * @param int $partition whose Bloom Filter we want to add the word_key to - * @param string $word_key the key to add - * @return bool whether the add was successful + * @param $i which shard to set the current shard to be */ - public function addPartitionIndexFilter($partition, $word_key) - { - if($this->initPartitionIndexFilter($partition) === false) { + public function setCurrentShard($i) + { + if(isset($this->generation_info['CURRENT']) && + $i == $this->generation_info['CURRENT'] || + $i > $this->generation_info['ACTIVE']) { return false; + } else { + $this->generation_info['CURRENT'] = $i; + return true; } - if(!$this->index_partition_filters[$partition]->contains($word_key)) { - $this->generation_info['NUM_WORDS']++; - $this->index_partition_filters[$partition]->add($word_key); - } - - return true; - } - - /** - * Initializes or constructs the Bloom filter assocaited with a partition - * @param int $partition index of desired partition - * @return bool whether the operation was successful - */ - public function initPartitionIndexFilter($partition) - { - if(!isset($this->index_partition_filters[$partition])) { - if(file_exists($this->dir_name. - "/index_filters/partition$partition.ftr")) { - $this->index_partition_filters[$partition] = - BloomFilterFile::load( - $this->dir_name . - "/index_filters/partition$partition.ftr"); - } else { - $filter_size = $this->num_words_per_generation; - $this->index_partition_filters[$partition] = - new BloomFilterFile( - $this->dir_name . - "/index_filters/partition$partition.ftr", $filter_size); - } - } - return true; - } + } /** * Gets the page out of the summaries WebArchiveBundle with the given @@ -521,44 +308,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants return $this->summaries->getPage($key, $offset); } - /** - * Returns a block of documents a word occur in. The doc block looked up - * is at a given offset into the word's partition WebArchive for a given - * generation. 
This is used when the word occurs more the - * COMMON_WORD_THRESHOLD many times in a generation - * - * @param string $word_key hash of word whose doc block we are looking up - * @param int $offset byte offset into word's partition WebArchive for the - * supplied generation - * @param int $generation which generation to look up the doc block of - * @return array the desired doc block - */ - public function getWordDocBlock($word_key, $offset, $generation = -1) - { - if($generation == -1) { - return $this->index->getPage($word_key, $offset); - } else { - $archive = - new WebArchiveBundle($this->dir_name."/index".$generation); - return $archive->getPage($word_key, $offset); - } - } - /** - * Gets a page using in WebArchive $partition of the word index - * using the provided byte $offset and using existing $file_handle - * if possible. - * - * @param int $partition which WebArchive to look in - * @param int $offset byte offset of page data - * @param resource $file_handle file handle resource of $partition archive - * @return array desired page - */ - public function getPageByPartition($partition, $offset, $file_handle = NULL) - { - return $this->index->getPageByPartition( - $partition, $offset, $file_handle); - } /** * Adds the given summary to the summary exists filter bundle @@ -588,125 +338,15 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * Forces the data in the page exists filter bundle of summaries - * to be save to disk, forces each index partition summary to be saved + * to be save to disk, forces the current shard to be saved, the current + * filter in the index filter bundle to be save */ public function forceSave() { $this->summaries->forceSave(); - for($i = 0; $i < $this->num_partitions_index; $i++) { - if(isset($this->index_partition_filters[$i]) && - $this->index_partition_filters[$i] != NULL) { - $this->index_partition_filters[$i]->save(); - } - } + $this->getActiveShard()->save(); } - /** - * Computes statistics for the 
provided phrase_key. - * These include an estimate of the total number of documents it occurs in, - * as well as which generations it occurs in, and what are its info block - * looks like in the current generation - * - * @param string $phrase_key what to compute statistics for - * @param int $generation_index the current generation - * @param array $info_block info_block of the phrase_key (will look up - * if not provided) - * @return array info for this $phrase_key - */ - public function getPhraseIndexInfo( - $phrase_key, $generation_index = 0, $info_block = NULL) - { - - $partition = - WebArchiveBundle::selectPartition( - $phrase_key, $this->num_partitions_index); - $info = array(); - if($info_block == NULL) { - - if(!$this->initPartitionIndexFilter($partition)) { - return NULL; - } - $filter = & $this->index_partition_filters[$partition]; - - if($filter == NULL || !$filter->contains($phrase_key)) { - return NULL; - } - - $active_generation = $this->generation_info['ACTIVE']; - - $min_generation = 0; - for($i = 0; $i <= $active_generation; $i++) { - if($filter->contains($phrase_key . $i)) { - if($filter->contains("delete". $phrase_key . 
$i)) { - $info['GENERATIONS'] = array(); - //truncate all previously seen - } else { - $info['GENERATIONS'][] = $i; - } - } - } - $num_generations = count($info['GENERATIONS']); - if($num_generations == 0) { - return NULL; - } - - $sample_size = min($num_generations, SAMPLE_GENERATIONS); - $sum_count = 0; - for($i = 0; $i < $sample_size; $i++) { - $block_info = - $this->readPartitionInfoBlock( - $partition, $info['GENERATIONS'][$i]); - $sum_count += $block_info[$phrase_key][self::COUNT]; - } - - $info['TOTAL_COUNT'] = - ceil(($sum_count*$num_generations)/$sample_size); - // this is an estimate - } else { - $info['TOTAL_COUNT'] = $info_block['TOTAL_COUNT']; - $info['GENERATIONS'] = $info_block['GENERATIONS']; - } - - $block_info = $this->readPartitionInfoBlock( - $partition, $info['GENERATIONS'][$generation_index]); - $phrase_info = $block_info[$phrase_key]; - - $info['CURRENT_GENERATION_INDEX'] = $generation_index; - - if(isset($phrase_info)) { - $phrase_info['CURRENT_GENERATION_INDEX'] = - $info['CURRENT_GENERATION_INDEX']; - $phrase_info['TOTAL_COUNT'] = $info['TOTAL_COUNT']; - $phrase_info['GENERATIONS'] = $info['GENERATIONS']; - return $phrase_info; - } else { - return NULL; - } - - } - - /** - * Sets the information associated with a word in the inverted index - * - * @param string $phrase_key - * @param array $info - */ - public function setPhraseIndexInfo($phrase_key, $info) - { - $partition = WebArchiveBundle::selectPartition( - $phrase_key, $this->num_partitions_index); - - $partition_block_data = $this->readPartitionInfoBlock($partition); - - if($partition_block_data == NULL || !is_array($partition_block_data)) { - $partition_block_data = array(); - } - - $partition_block_data[$phrase_key] = $info; - - $this->writePartitionInfoBlock($partition, $partition_block_data); - - } /** * Computes the words which appear in the fewest or most documents @@ -723,12 +363,11 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants 
if(!is_array($word_keys) || count($word_keys) < 1) { return NULL;} foreach($word_keys as $word_key) { - $info = $this->getPhraseIndexInfo($word_key); - if(isset($info['TOTAL_COUNT'])) { - $words_array[$word_key] = $info['TOTAL_COUNT']; - } else { + $tmp = $this->getCurrentShard()->getWordInfo($word_key); + if($tmp === false) { $words_array[$word_key] = 0; - return NULL; + } else { + $words_array[$word_key] = $tmp[2]; } } @@ -737,34 +376,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants return array_slice($words_array, 0, $num); } - /** - * Reads the info block of $partition index WebArchive - * - * @param int $partition WebArchive to read from - * @return array data in its info block - */ - public function readPartitionInfoBlock($partition, $generation = -1) - { - if($generation == -1) { - return $this->index->readPartitionInfoBlock($partition); - } else { - $archive = new WebArchiveBundle( - $this->dir_name."/index".$generation); - return $archive->readPartitionInfoBlock($partition); - } - - } - - /** - * Write $data into the info block of the $partition index WebArchive - * - * @param int $partition WebArchive to write into - * @param array $data what to write - */ - public function writePartitionInfoBlock($partition, $data) - { - $this->index->writePartitionInfoBlock($partition, $data); - } /** * Gets the description, count of summaries, and number of partions of the diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php index 74adf2085..d5ace8fa8 100644 --- a/lib/index_bundle_iterators/group_iterator.php +++ b/lib/index_bundle_iterators/group_iterator.php @@ -78,6 +78,9 @@ class GroupIterator extends IndexBundleIterator */ var $count_block; + /** + * + */ var $current_block_hashes; /** @@ -98,13 +101,11 @@ class GroupIterator extends IndexBundleIterator * * @param object $index_bundle_iterator to use as a source of documents * to iterate over - * @param int $limit the first element to 
return from the list of docs - * iterated over + */ - function __construct($index_bundle_iterator, $limit = 0) + function __construct($index_bundle_iterator) { $this->index_bundle_iterator = $index_bundle_iterator; - $this->limit = $limit; $this->num_docs = $this->index_bundle_iterator->num_docs; $this->reset(); } @@ -116,26 +117,10 @@ class GroupIterator extends IndexBundleIterator function reset() { $this->index_bundle_iterator->reset(); - $time = time(); $this->grouped_keys = array(); // -1 == never save, so file name not used using time to be safer $this->seen_docs = 0; $this->seen_docs_unfiltered = 0; - $beneath_limit = true; - while($beneath_limit == true) { - - $doc_block = $this->currentDocsWithWord(); - if($doc_block == -1 || !is_array($doc_block)) { - $beneath_limit = false; - continue; - } - if($this->seen_docs + $this->count_block >= $this->limit) { - $beneath_limit = false; - continue; - } - $this->advance(); - } - } /** @@ -148,6 +133,7 @@ class GroupIterator extends IndexBundleIterator { $pages = $this->index_bundle_iterator->currentDocsWithWord(); + $this->count_block_unfiltered = count($pages); if(!is_array($pages)) { return $pages; @@ -159,28 +145,28 @@ class GroupIterator extends IndexBundleIterator if($this->count_block_unfiltered > 0 ) { $i = $this->seen_docs; foreach($pages as $doc_key => $doc_info) { - if(!is_array($doc_info)) {continue;} + if(!is_array($doc_info) || + isset($doc_info[self::DUPLICATE])) {continue;} $doc_info['KEY'] = $doc_key; - $doc_key_parts = explode(":", $doc_key); - if(count($doc_key_parts) == 1) { - $hash_url = $doc_key_parts[0]; + if(strlen($doc_key) == 8) { + $hash_url = $doc_key; $doc_info['IS_PAGE'] = true; } else { + $doc_key_parts = array( + substr($doc_key, 0, 8),substr($doc_key, 9, 8), + substr($doc_key, 18, 8) + ); $hash_url = $doc_key_parts[1]; $doc_info['IS_PAGE'] = false; } if(isset($this->grouped_keys[$hash_url])) { - if( $i < $this->limit) { - continue; - } else { - if(isset($pre_out_pages[$hash_url]) ) { 
- $pre_out_pages[$hash_url][] = $doc_info; - if($doc_info['IS_PAGE'] == true) { - $pre_out_pages[$hash_url]['IS_PAGE'] = true; - } else { - $pre_out_pages[$hash_url]['HASH_INFO_URL'] = - $doc_key_parts[2]; - } + if(isset($pre_out_pages[$hash_url]) ) { + $pre_out_pages[$hash_url][] = $doc_info; + if($doc_info['IS_PAGE'] == true) { + $pre_out_pages[$hash_url]['IS_PAGE'] = true; + } else { + $pre_out_pages[$hash_url]['HASH_INFO_URL'] = + $doc_key_parts[2]; } } } else { @@ -202,17 +188,17 @@ class GroupIterator extends IndexBundleIterator $hash_info_url= $pre_out_pages[$hash_url]['HASH_INFO_URL']; $word_iterator = new WordIterator($hash_info_url, - $this->getIndex(), 0); + $this->getIndex(), true); $doc_array = $word_iterator->currentDocsWithWord(); if(is_array($doc_array) && count($doc_array) == 1) { $keys = array_keys($doc_array); $key = $keys[0]; - if($doc_array[$key][self::DOC_RANK] > -1) { + if(!isset($doc_array[$key][self::DUPLICATE]) ) { $pre_out_pages[$hash_url][$key] = $doc_array[$key]; $pre_out_pages[$hash_url][$key]['IS_PAGE'] = true; } else { /* - Deduplication: idea is if the score < 0 + Deduplication: a deduplicate info: page was written, so we should ignore that group. 
*/ @@ -230,15 +216,6 @@ class GroupIterator extends IndexBundleIterator } $this->count_block = count($pre_out_pages); - if($this->seen_docs < $this->limit) { - $total_docs = $this->seen_docs + $this->count_block; - if($total_docs < $this->limit) { - $pre_out_pages =array(); - } else { - $pre_out_pages = array_slice($pre_out_pages, - $this->limit - $this->seen_docs, NULL, true); - } - } $out_pages = array(); foreach($pre_out_pages as $hash_url => $group_infos) { foreach($group_infos as $doc_info) { @@ -312,7 +289,7 @@ class GroupIterator extends IndexBundleIterator $out_pages[$doc_key] = $doc_info; foreach($doc_info[self::SUMMARY_OFFSET] as $offset_array) { list($key, $summary_offset) = $offset_array; - $index = $this->getIndex($key); + $index = & $this->getIndex($key); $page = $index->getPage( $key, $summary_offset); if(!isset($out_pages[$doc_key][self::SUMMARY])) { @@ -358,9 +335,9 @@ class GroupIterator extends IndexBundleIterator /** * Returns the index associated with this iterator - * @return object the index + * @return &object the index */ - function getIndex($key = NULL) + function &getIndex($key = NULL) { return $this->index_bundle_iterator->getIndex($key); } diff --git a/lib/index_bundle_iterators/index_bundle_iterator.php b/lib/index_bundle_iterators/index_bundle_iterator.php index cd33b23c5..9bb7fec8c 100644 --- a/lib/index_bundle_iterators/index_bundle_iterator.php +++ b/lib/index_bundle_iterators/index_bundle_iterator.php @@ -63,13 +63,6 @@ abstract class IndexBundleIterator implements IndexingConstants, CrawlConstants */ var $seen_docs; - /** - * First document that should be returned - * amongst all of the documents associated with the - * iterator's $word_key - * @var int - */ - var $limit; /** * The number of documents in the current block * @var int @@ -104,7 +97,7 @@ abstract class IndexBundleIterator implements IndexingConstants, CrawlConstants * Returns the index associated with this iterator * @return object the index */ - abstract function 
getIndex($key = NULL); + abstract function &getIndex($key = NULL); /** * Hook function used by currentDocsWithWord to return the current block diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php index a53ece59c..6bfa74121 100644 --- a/lib/index_bundle_iterators/intersect_iterator.php +++ b/lib/index_bundle_iterators/intersect_iterator.php @@ -102,13 +102,10 @@ class IntersectIterator extends IndexBundleIterator * * @param object $index_bundle_iterator to use as a source of documents * to iterate over - * @param int $limit the first element to return from the list of docs - * iterated over */ - function __construct($index_bundle_iterators, $limit = 0) + function __construct($index_bundle_iterators) { $this->index_bundle_iterators = $index_bundle_iterators; - $this->limit = $limit; $this->num_iterators = count($index_bundle_iterators); $this->num_docs = -1; @@ -138,19 +135,8 @@ class IntersectIterator extends IndexBundleIterator $this->seen_docs = 0; $this->seen_docs_unfiltered = 0; - $beneath_limit = true; - while($beneath_limit == true) { - $doc_block = $this->currentDocsWithWord(); - if($doc_block == -1 || !is_array($doc_block)) { - $beneath_limit = false; - continue; - } - if($this->seen_docs + $this->count_block >= $this->limit) { - $beneath_limit = false; - continue; - } - $this->advance(); - } + $doc_block = $this->currentDocsWithWord(); + } /** @@ -294,7 +280,7 @@ class IntersectIterator extends IndexBundleIterator * Returns the index associated with this iterator * @return object the index */ - function getIndex($key = NULL) + function &getIndex($key = NULL) { return $this->index_bundle_iterators[0]->getIndex($key = NULL); } diff --git a/lib/index_bundle_iterators/phrase_filter_iterator.php b/lib/index_bundle_iterators/phrase_filter_iterator.php index 36b98dddf..a844c3e9d 100644 --- a/lib/index_bundle_iterators/phrase_filter_iterator.php +++ b/lib/index_bundle_iterators/phrase_filter_iterator.php 
@@ -119,14 +119,11 @@ class PhraseFilterIterator extends IndexBundleIterator * phrases * @param float $weight a quantity to multiply each score returned from * this iterator with - * @param int $limit the first element to return from the list of docs - * iterated over */ function __construct($index_bundle_iterator, $restrict_phrases, - $disallow_phrases, $weight = 1, $limit = 0) + $disallow_phrases, $weight = 1) { $this->index_bundle_iterator = $index_bundle_iterator; - $this->limit = $limit; $this->restrict_phrases = $restrict_phrases; $this->disallow_phrases = $disallow_phrases; $this->num_docs = $this->index_bundle_iterator->num_docs; @@ -144,19 +141,7 @@ class PhraseFilterIterator extends IndexBundleIterator $this->index_bundle_iterator->reset(); $this->seen_docs = 0; $this->seen_docs_unfiltered = 0; - $beneath_limit = true; - while($beneath_limit == true) { - $doc_block = $this->currentDocsWithWord(); - if($doc_block == -1 || !is_array($doc_block)) { - $beneath_limit = false; - continue; - } - if($this->seen_docs + $this->count_block > $this->limit) { - $beneath_limit = false; - continue; - } - $this->advance(); - } + $doc_block = $this->currentDocsWithWord(); } /** @@ -224,15 +209,6 @@ class PhraseFilterIterator extends IndexBundleIterator } $this->count_block = count($pages); - if($this->seen_docs < $this->limit) { - $total_docs = $this->seen_docs + $this->count_block; - if($total_docs < $this->limit) { - $pages =array(); - } else { - $pages = array_slice($pages, - $this->limit - $this->seen_docs, NULL, true); - } - } $this->summaries = $pages; $this->pages = array(); foreach($pages as $doc_key => $doc_info) { @@ -301,9 +277,9 @@ class PhraseFilterIterator extends IndexBundleIterator /** * Returns the index associated with this iterator - * @return object the index + * @return &object the index */ - function getIndex($key = NULL) + function &getIndex($key = NULL) { return $this->index_bundle_iterator->getIndex($key = NULL); } diff --git 
a/lib/index_bundle_iterators/union_iterator.php b/lib/index_bundle_iterators/union_iterator.php index bfbb3b5ac..4c79ec0a0 100644 --- a/lib/index_bundle_iterators/union_iterator.php +++ b/lib/index_bundle_iterators/union_iterator.php @@ -96,13 +96,10 @@ class UnionIterator extends IndexBundleIterator * * @param object $index_bundle_iterator to use as a source of documents * to iterate over - * @param int $limit the first element to return from the list of docs - * iterated over */ - function __construct($index_bundle_iterators, $limit = 0) + function __construct($index_bundle_iterators) { $this->index_bundle_iterators = $index_bundle_iterators; - $this->limit = $limit; /* estimate number of results by sum of all iterator counts, then improve estimate as iterate @@ -127,19 +124,7 @@ class UnionIterator extends IndexBundleIterator $this->seen_docs = 0; $this->seen_docs_unfiltered = 0; - $beneath_limit = true; - while($beneath_limit == true) { - $doc_block = $this->currentDocsWithWord(); - if($doc_block == -1 || !is_array($doc_block)) { - $beneath_limit = false; - continue; - } - if($this->seen_docs + $this->count_block >= $this->limit) { - $beneath_limit = false; - continue; - } - $this->advance(); - } + $doc_block = $this->currentDocsWithWord(); } /** @@ -199,7 +184,6 @@ class UnionIterator extends IndexBundleIterator $keys = array_keys($this->pages); } $out_pages = array(); - echo "hello".$this->pages[$key[0]]["ITERATOR"]."<br/>"; foreach($keys as $doc_key) { if(!isset($this->pages[$doc_key]["ITERATOR"])) { continue; @@ -238,9 +222,9 @@ class UnionIterator extends IndexBundleIterator /** * Returns the index associated with this iterator - * @return object the index + * @return &object the index */ - function getIndex($key = NULL) + function &getIndex($key = NULL) { if($key != NULL) { if($this->current_block_fresh == false) { diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php index 36a028246..a1e31358d 100644 --- 
a/lib/index_bundle_iterators/word_iterator.php +++ b/lib/index_bundle_iterators/word_iterator.php @@ -90,73 +90,61 @@ class WordIterator extends IndexBundleIterator var $index; /** - * If iterating through the linked-list portions of the documents - * the next byte offset in the WebArchive based linked-list + * The next byte offset in the IndexShard * @var int */ var $next_offset; - /** - * Block number of the last block of docs - * @var int - */ - var $last_pointed_block; - /** - * @var int - */ - var $list_offset; /** - * Pointers to offsets for blocks containing docs with the given word - * for the current generation - * @var array - */ - var $block_pointers; - /** - * Number of completely full blocks of documents for the current generation + * The current byte offset in the IndexShard * @var int */ - var $num_full_blocks; + var $current_offset; + /** - * Number of generations word appears in + * Last Offset of word occurence in the IndexShard * @var int */ - var $num_generations; + var $last_offset; + /** - * Used to store the contents of the last partially full block + * Keeps track of whether the word_iterator list is empty becuase the + * word does not appear in the index shard * @var int */ - var $last_block; - /** - * the info block of the WebArchive that the word lives in - * @var object - */ - var $info_block; + var $empty; + /** - * Stores the number of the current block of documents we are at in the - * set of all blocks of BLOCK_SIZE many documents + * Number of documents returned for each block (at most) * @var int */ - var $current_pointer; + const RESULTS_PER_BLOCK = 2000; /** * Creates a word iterator with the given parameters. 
* * @param string $word_key hash of word or phrase to iterate docs of - * @param object $index the IndexArchiveBundle to use + * @param object &$index the IndexArchiveBundle to use * @param int $limit the first element to return from the list of docs * iterated over - * @param object $info_block the info block of the WebArchive - * associated with the word in the index. If NULL, then this will - * loaded in WordIterator::reset() + * @param bool $raw whether the $word_key is our variant of base64 encoded */ - function __construct($word_key, $index, $limit = 0, $info_block = NULL) + function __construct($word_key, &$index, $raw = false) { $this->word_key = $word_key; - $this->index = $index; - $this->limit = $limit; - $this->info_block = $info_block; + + $this->index = & $index; $this->current_block_fresh = false; - $this->reset(); + $tmp = $index->getCurrentShard()->getWordInfo($word_key, $raw); + if ($tmp === false) { + $this->empty = true; + } else { + list($this->current_offset, $this->last_offset, $this->num_docs) + = $tmp; + $this->empty = false; + + $this->reset(); + } } /** @@ -168,99 +156,9 @@ class WordIterator extends IndexBundleIterator { $this->count_block = 0; $this->seen_docs = 0; - - $partition = - WebArchiveBundle::selectPartition($this->word_key, - $this->index->num_partitions_index); - if($this->info_block == NULL) { - $this->info_block = - $this->index->getPhraseIndexInfo($this->word_key); - } - if($this->info_block !== NULL) { - $this->num_generations = count($this->info_block['GENERATIONS']); - $count_till_generation = $this->info_block[self::COUNT]; - - while($this->limit >= $count_till_generation) { - $this->info_block['CURRENT_GENERATION_INDEX']++; - if($this->num_generations <= - $this->info_block['CURRENT_GENERATION_INDEX']) { - $this->num_docs = 0; - $this->current_pointer = -1; - return; - } - $info_block = $this->index->getPhraseIndexInfo( - $this->word_key, - $this->info_block['CURRENT_GENERATION_INDEX'], - $this->info_block); - 
if($info_block !== NULL) { - $this->info_block = $info_block; - } - $count_till_generation += $this->info_block[self::COUNT]; - } - $this->seen_docs = $count_till_generation - - $this->info_block[self::COUNT]; - - } - - - $this->initGeneration(); - } - /** - * Sets up the iterator to iterate through the current generation. - * - * @return bool whether the initialization succeeds - */ - function initGeneration() - { - - if($this->info_block !== NULL) { - $info_block = $this->index->getPhraseIndexInfo( - $this->word_key, $this->info_block['CURRENT_GENERATION_INDEX'], - $this->info_block); - if($info_block === NULL) { - return false; - } - $this->info_block = & $info_block; - $this->num_docs = $info_block['TOTAL_COUNT']; - $this->num_docs_generation = $info_block[self::COUNT]; - - $this->current_pointer = - max(floor(($this->limit - $this->seen_docs) / BLOCK_SIZE), 0); - $this->seen_docs += $this->current_pointer*BLOCK_SIZE; - $this->last_block = $info_block[self::END_BLOCK]; - $this->num_full_blocks = - floor($this->num_docs_generation / BLOCK_SIZE); - if($this->num_docs_generation > COMMON_WORD_THRESHOLD) { - $this->last_pointed_block = - floor(COMMON_WORD_THRESHOLD / BLOCK_SIZE); - } else { - $this->last_pointed_block = $this->num_full_blocks; - } - - for($i = 0; $i < $this->last_pointed_block; $i++) { - if(isset($info_block[$i])) { - $this->block_pointers[$i] = $info_block[$i]; - } - } - - if($this->num_docs_generation > COMMON_WORD_THRESHOLD) { - if($info_block[self::LIST_OFFSET] === NULL) { - $this->list_offset = NULL; - } else { - $this->list_offset = $info_block[self::LIST_OFFSET]; - } - } - - } else { - $this->num_docs = 0; - $this->num_docs_generation = 0; - $this->current_pointer = -1; - } - return true; - } /** * Hook function used by currentDocsWithWord to return the current block @@ -270,96 +168,13 @@ class WordIterator extends IndexBundleIterator */ function findDocsWithWord() { - if($this->num_generations <= - 
$this->info_block['CURRENT_GENERATION_INDEX']) { - $this->pages = NULL; - return -1; - } - $generation = - $this->info_block['GENERATIONS'][ - $this->info_block['CURRENT_GENERATION_INDEX']]; - if($this->current_pointer >= 0) { - if($this->current_pointer == $this->num_full_blocks) { - $pages = $this->last_block; - } else if ($this->current_pointer >= $this->last_pointed_block) { - /* if there are more than COMMON_WORD_THRESHOLD many - results and we're not at the last block yet - */ - if($this->list_offset === NULL) { - $this->pages = NULL; - return -1; - } - $offset = $this->list_offset; - $found = false; - do { - /* the link list is actually backwards to the order we want - For now, we cycle along the list from the last data - stored until we find the block we want. This is slow - but we are relying on the fact that each generation is - not too big. - */ - $doc_block = $this->index->getWordDocBlock($this->word_key, - $offset, $generation); - $word_keys = array_keys($doc_block); - $found_key = NULL; - foreach($word_keys as $word_key) { - if(strstr($word_key, $this->word_key.":")) { - $found_key = $word_key; - if(isset($doc_block[ - $found_key][self::LIST_OFFSET])) { - //only one list offset/docblock - break; - } - } - } - if($found_key === NULL) { - break; - } - if(isset($doc_block[ - $this->word_key.":".$this->current_pointer])) { - $found = true; - break; - } - $offset = $doc_block[$found_key][self::LIST_OFFSET]; - } while($offset != NULL); - if($found != true) { - $pages = array(); - } else { - $pages = & $doc_block[ - $this->word_key.":".$this->current_pointer]; - } - } else { - //first COMMON_WORD_THRESHOLD many results fast - if(isset($this->block_pointers[$this->current_pointer])) { - $doc_block = $this->index->getWordDocBlock($this->word_key, - $this->block_pointers[$this->current_pointer], - $generation); - if(isset( - $doc_block[$this->word_key.":".$this->current_pointer] - )) { - $pages = & - $doc_block[ - $this->word_key.":".$this->current_pointer]; - 
} else { - $pages = array(); - } - } else { - $pages = array(); - } - } - - if($this->seen_docs < $this->limit) { - $diff_offset = $this->limit - $this->seen_docs; - - $pages = array_slice($pages, $diff_offset); - } - $this->pages = & $pages; - $this->count_block = count($pages); - return $pages; - } else { - $this->pages = NULL; + if($this->current_offset > $this->last_offset || $this->empty) { return -1; } + $this->next_offset = $this->current_offset; + $results = $this->index->getCurrentShard()->getWordSlice( + $this->next_offset, $this->last_offset, self::RESULTS_PER_BLOCK); + return $results; } @@ -368,30 +183,21 @@ class WordIterator extends IndexBundleIterator */ function advance() { - if($this->current_pointer < 0) {return;} - $this->advanceSeenDocs(); - - $this->current_pointer ++; - if($this->current_pointer > $this->num_full_blocks) { - $flag = false; - while ($this->info_block['CURRENT_GENERATION_INDEX'] < - $this->num_generations - 1 && !$flag) { - $this->info_block['CURRENT_GENERATION_INDEX']++; - $flag = $this->initGeneration(); - } - if ($this->info_block['CURRENT_GENERATION_INDEX'] >= - $this->num_generations - 1) { - $this->current_pointer = - 1; - } + if($this->current_offset < $this->next_offset) { + $this->current_offset = $this->next_offset; + } else { + $this->current_offset = $this->last_offset + 1; } + + } /** * Returns the index associated with this iterator - * @return object the index + * @return &object the index */ - function getIndex($key = NULL) + function &getIndex($key = NULL) { return $this->index; } diff --git a/lib/index_shard.php b/lib/index_shard.php index e903bc5b1..d26ba0d3c 100644 --- a/lib/index_shard.php +++ b/lib/index_shard.php @@ -34,6 +34,23 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** + * Read in base class, if necessary + */ +require_once "persistent_structure.php"; + +/** + * Load charCopy + */ +require_once "utility.php"; + +/** + *Loads common constants for web crawling + */ +require_once 
BASE_DIR.'/lib/crawl_constants.php'; + +/** + * Data structure used to store one generation worth of the word document + * index (inverted index). * * @author Chris Pollett * @@ -41,51 +58,474 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} * @subpackage library */ -class IndexShard extends PersistentStructure implements Serializable +class IndexShard extends PersistentStructure implements CrawlConstants { - var $doc_ids; + /** + * Stores document id's and links to documents id's together with + * summary offset information, and number of words in the doc/link + * The format for a record is 8 bytes for a doc id, 1 bit is + * a link record flag, 31 bits for the summary offset (byte offset into + * web archive of the data for this document) and 4 bytes + * for number of words in doc. In the case of a link, there is + * an 8 byte link hash followed by the link record flag bit being on, + * followed by 31 bits for the summary offset, followed by 8 bytes for + * the hash of the url being pointed to by the link, followed by 8 + * bytes for the hash of "info:url_pointed_to_by_link", followed by 4 bytes + * for numbers of word in link. + * @var string + */ + var $doc_infos; + /** + * Length of $doc_infos as a string + * @var int + */ + var $docids_len; + /** + * A string consisting of interwoven linked-lists. A given linked-list + * store all the documents containing a given word. The format + * of a record in such a list consists of: 3 byte offset into $doc_infos + * for the document, followed by 1 byte recording the number of occurrence + * of the word in the document, followed by a four byte next pointer into + * the $word_docs string of the next record in the linked-list. + * @var string + */ var $word_docs; - var $count_doc256; + /** + * Length of $word_docs as a string + * @var int + */ + var $word_docs_len; + /** + * Used to store information about a word in this index shard. 
+ * $words is an associative array, the key being an 8 byte word hash, + * the value being a 12 byte record. The first 4 bytes of this record + * being the offset to the start of the linked-list for that word in + * $word_docs, the next 4 bytes of this record being the last record + * for this word in the link-list, and the last 4 bytes recording the + * number of records in this linked-list. + * + * @var array + */ + var $words; + + /** + * This is supposed to hold the number of documents that have been stored + * in earlier shards, prior to the current shard. + */ + var $generation_offset; + /** + * Number of documents (not links) stored in this shard + * @var int + */ + var $num_docs; + /** + * Number of links (not documents) stored in this shard + * @var int + */ + var $num_link_docs; + /** + * Number of words stored in total in all documents in this shard + * @var int + */ + var $len_all_docs; + /** + * Number of words stored in total in all links in this shard + * @var int + */ + var $len_all_link_docs; - function __construct() + /** + * Used to keep track of whether a record in document infos is for a + * document or for a link + */ + const COMPOSITE_ID_FLAG = 0x80000000; + + /** + * Makes an index shard with the given file name and generation offset + * + * @param $fname filename to store the index shard with + * @param $generation_offset when returning documents from the shard + * pretend there are this many earlier documents + */ + function __construct($fname, $generation_offset = 0) { + parent::__construct($fname, -1); + $this->generation_offset = $generation_offset; + $this->word_docs = ""; + $this->word_docs_len = 0; + $this->words = array(); + $this->docids_len = 0; + $this->doc_infos = ""; + $this->num_docs = 0; + $this->num_link_docs = 0; + $this->len_all_docs = 0; + $this->len_all_link_docs = 0; } - - function addDocumentWords($doc_id, $word_id_array) + + /** + * Add a new document to the index shard with the given summary offset. 
+ * Associate with this document the supplied list of words and word counts. + * Finally, associate the given meta words with this document. + * + * @param string $doc_id id of document to insert + * @param int $summary_offset its offset into the word archive its data + * is stored in + * @param array $word_counts (word => number of occurrences of word) pairs + * for each word in the document + * @param array $meta_ids meta words to be associated with the document + * an example meta word would be filetype:pdf for a PDF document. + */ + function addDocumentWords($doc_id, $summary_offset, $word_counts, + $meta_ids) { - $this->doc_ids[] = $doc_id; - - foreach($word_id_arr as $word_id => $relevance) { - $relevance = $relevance & 255; - $store = pack("N", $this->count_doc256 + $relevance); - $this->word_docs[$word_id] .= $store; + $is_doc = false; + $doc_len = 0; + $link_doc_len = 0; + if(strlen($doc_id) == 8) { //actual doc case + $this->doc_infos .= $doc_id . pack("N", $summary_offset); + $extra_offset = 0; + $this->num_docs++; + $is_doc = true; + } else { //link item + if(strlen($doc_id) !== 26) { + return false; + } + $id_parts = array(substr($doc_id, 0, 8), + substr($doc_id, 9, 8), substr($doc_id, 18, 8)); + $this->num_link_docs++; + $this->doc_infos .= $id_parts[0] . pack("N", + ($summary_offset | self::COMPOSITE_ID_FLAG)) . + $id_parts[1] . $id_parts[2]; + $extra_offset = 16; + } + foreach($meta_ids as $meta_id) { + $word_counts[$meta_id] = 0; + } + foreach($word_counts as $word => $occurrences) { + $word_id = crawlHash($word, true); + $occurrences = ($occurrences > 255 ) ? 
255 : $occurrences & 255; + $store = pack("N", ($this->docids_len << 4) + $occurrences); + $store .= pack("N", $this->word_docs_len); + if(!isset($this->words[$word_id])) { + $value = pack("N", $this->word_docs_len); + $value .= $value.pack("N", 1); + } else { + $value = $this->words[$word_id]; + $first_string = substr($value, 0, 4); + $previous_string = substr($value, 4, 4); + $count_array = unpack("N", substr($value, 8, 4)); + $count = $count_array[1]; + if($count == 0x7FFFFFFF) { continue; } + $count++; + $value = $first_string . pack("N", $this->word_docs_len) . + pack("N", $count); + $tmp = unpack("N", $previous_string); + $previous = $tmp[1]; + $previous_info = substr($this->word_docs, $previous, 8); + $previous_doc_occ = substr($previous_info, 0, 4); + $offset = $this->word_docs_len - $previous; + $previous_info = $previous_doc_occ.pack("N", $offset); + charCopy($previous_info, $this->word_docs, $previous, 8); + } + $this->words[$word_id] = $value; + $this->word_docs .= $store; + $this->word_docs_len += 8; + if($occurrences > 0) { + if($is_doc == true) { + $doc_len += $occurrences; + } else { + $link_doc_len += $occurrences; + } + } } - $this->count_doc256 += 256; + $this->len_all_docs += $doc_len; + $this->len_all_link_docs += $link_doc_len; + if($is_doc == true) { + $this->doc_infos .= pack("N", $doc_len); + } else { + $this->doc_infos .= pack("N", $link_doc_len); + } + $this->docids_len += 16 + $extra_offset; } - function getWordSlice($word_id, $start, $len) + /** + * Returns the first offset, last offset, and number of documents the + * word occurred in for this shard. The first offset (similarly, the last + * offset) is the byte offset into the word_docs string of the first + * (last) record involving that word. 
+ * + * @param string $word_id id of the word one wants to look up + * @param bool $raw whether the id is our version of base64 encoded or not + */ + function getWordInfo($word_id, $raw = false) { - $result = array(); - if(isset($word_docs[$word_id])) { - $docs_string = substr($word_docs[$word_id], $start << 2, $len <<2); - //check if got at least one item - if($docs_string !== false && ($doc_len = strlen($doc_string)) > 3) { - for($i = 0; $i < $doc_len; $i += 4) { + + if($raw == false) { + //get rid of our modified base64 encoding + $hash = str_replace("_", "/", $word_id); + $hash = str_replace("-", "+" , $hash); + $hash .= "="; + $word_id = base64_decode($hash); + + } + + if(!isset($this->words[$word_id])) { + return false; + } + $first_string = substr($this->words[$word_id], 0, 4); + $tmp = unpack("N", $first_string); + $first_offset = $tmp[1]; + $last_string = substr($this->words[$word_id], 4, 4); + $tmp = unpack("N", $last_string); + $last_offset = $tmp[1]; + $count_string = substr($this->words[$word_id], 8, 4); + $tmp = unpack("N", $count_string); + $count = $tmp[1]; + + + return array($first_offset, $last_offset, $count); + + } + + /** + * Returns documents using the word_docs string of records starting + * at the given offset and using its link-list of records. Traversal of + * the list stops if an offset larger than $last_offset is seen or + * $len many doc's have been returned. Since $next_offset is passed by + * reference the value of $next_offset will point to the next record in + * the list (if it exists) after the function is called. + * + * @param int &$next_offset where to start in word docs + * @param int $last_offset offset at which to stop by + * @param int $len number of documents desired + * @return array desired list of doc's and their info + */ + function getWordSlice(&$next_offset, $last_offset, $len) + { + $num_docs_so_far = 0; + $num_doc_or_links = ($next_offset > 0) ? 
$last_offset/$next_offset + : 1; //very approx + $results = array(); + do { + if($next_offset >= $this->word_docs_len) {break;} + $item = array(); + $doc_string = substr($this->word_docs, $next_offset, 4); + $tmp = unpack("N", $doc_string); + $doc_int = $tmp[1]; + $occurrences = $doc_int & 255; + $doc_index = ($doc_int >> 8); + $next_string = substr($this->word_docs, $next_offset + 4, 4); + $tmp = unpack("N", $next_string); + $old_next_offset = $next_offset; + $next_offset += $tmp[1]; + $doc_depth = log(10*(($doc_index +1) + + $this->generation_offset)*NUM_FETCHERS, 10); + $item[self::DOC_RANK] = number_format(11 - + $doc_depth, PRECISION); + $doc_loc = $doc_index << 4; + $doc_info_string = substr($this->doc_infos, $doc_loc, + 12); + $doc_id = substr($doc_info_string, 0, 8); + $tmp = unpack("N", substr($doc_info_string, 8, 4)); + $item[self::SUMMARY_OFFSET] = $tmp[1]; + $is_doc = false; + $skip_stats = false; + + if($item[self::SUMMARY_OFFSET] == 0x7FFFFFFF) { + $skip_stats = true; + $item[self::DUPLICATE] = true; + } else if(($tmp[1] & self::COMPOSITE_ID_FLAG) !== 0) { + //handles link item case + $item[self::SUMMARY_OFFSET] ^= self::COMPOSITE_ID_FLAG; + $doc_loc += 12; + $doc_info_string = substr($this->doc_infos, $doc_loc, 16); + $doc_id .= ":". + substr($doc_info_string, 0, 8).":". + substr($doc_info_string, 8, 8); + $average_doc_len = ($this->num_link_docs != 0) ? + $this->len_all_link_docs/$this->num_link_docs : 0; + $num_docs = $this->num_link_docs; + } else { + $is_doc = true; + $average_doc_len = $this->len_all_docs/$this->num_docs; + $num_docs = $this->num_docs; + } + + if(!$skip_stats) { + $tmp = unpack("N", substr($this->doc_infos, $doc_loc + 12, 4)); + $doc_len = $tmp[1]; + $doc_ratio = ($average_doc_len > 0) ? 
+ $doc_len/$average_doc_len : 0; + $pre_relevance = number_format( + 3 * $occurrences/ + ($occurrences + .5 + 1.5* $doc_ratio), + PRECISION); + $num_term_occurrences = $num_doc_or_links * + $num_docs/($this->num_docs + $this->num_link_docs); + $IDF = ($num_docs - $num_term_occurrences + 0.5) / + ($num_term_occurrences + 0.5); + $item[self::RELEVANCE] = $IDF * $pre_relevance; + $item[self::SCORE] = $item[self::DOC_RANK] + + .1*$item[self::RELEVANCE]; + } + $results[$doc_id] = $item; + $num_docs_so_far ++; + + } while ($next_offset<= $last_offset && $num_docs_so_far < $len + && $next_offset > $old_next_offset); + + return $results; + } + + + /** + * Returns $len many documents which contained the word corresponding to + * $word_id + * + * @param string $word_id key to look up documents for + * @param int number of documents desired back (from start of word linked + * list). + * @return array desired list of doc's and their info + */ + function getWordSliceById($word_id, $len) + { + $results = array(); + if(isset($this->words[$word_id])) { + list($first_offset, $last_offset, + $num_docs_or_links) = $this->getWordInfo($word_id, true); + $results = $this->getWordSlice($first_offset, $last_offset, $len); + } + return $results; + } + + /** + * Adds the contents of the supplied $index_shard to the current index + * shard + * + * @param object &$index_shard the shard to append to the current shard + */ + function appendIndexShard(&$index_shard) + { + $this->doc_infos .= $index_shard->doc_infos; + $this->word_docs .= $index_shard->word_docs; + $old_word_docs_len = $this->word_docs_len; + $this->word_docs_len += $index_shard->word_docs_len; + // update doc offsets in word_docs for newly added docs + for($i = $old_word_docs_len; $i < $this->word_docs_len; $i += 8) { + $doc_occurrences_string = substr($this->word_docs, $i, 4); + $tmp = unpack("N", $doc_occurrences_string); + $num = $tmp[1]; + $num += ($this->docids_len << 4); + $doc_occurrences_string = pack("N", $num); + 
charCopy($doc_occurrences_string, $this->word_docs, $i, 4); + } + + foreach($index_shard->words as $word_key => $word_docs_offset) { + $add_first_string = substr($word_docs_offset, 0, 4); + $tmp = unpack("N", $add_first_string); + $add_first_offset = $tmp[1]; + $add_last_string = substr($word_docs_offset, 4, 4); + $tmp = unpack("N", $add_last_string); + $add_last_offset = $tmp[1]; + $add_count = substr($word_docs_offset, 8, 4); + $tmp = unpack("N", $add_count); + $add_count = $tmp[1]; + if(!isset($this->words[$word_key])) { + $new_word_docs_offset = + pack("N", $old_word_docs_len + $add_first_offset). + pack("N", $old_word_docs_len + $add_last_offset). + pack("N", $add_count); + } else { + $value = $this->words[$word_key]; + $first_string = substr($value, 0, 4); + $last_string = substr($value, 4, 4); + $tmp = unpack("N", $last_string); + $last_offset = $tmp[1]; + $count_string = substr($value, 8, 4); + $tmp = unpack("N", $count_string); + $count = $tmp[1]; + if($count == 0x7FFFFFFF) { + + continue; } + $to_new_docs_offset = $add_first_offset + + ($old_word_docs_len - $last_offset); + $to_new_docs_string = pack("N", $to_new_docs_offset); + charCopy($to_new_docs_string, $this->word_docs, + $last_offset + 4, 4); + $new_word_docs_offset = $first_string . + pack("N", $old_word_docs_len + $add_last_offset) . + pack("N", $count + $add_count); } + $this->words[$word_key] = $new_word_docs_offset; } - return $result; + $this->docids_len += $index_shard->docids_len; + $this->num_docs += $index_shard->num_docs; + $this->num_link_docs += $index_shard->num_link_docs; + $this->len_all_docs += $index_shard->len_all_docs; + $this->len_all_link_docs += $index_shard->len_all_link_docs; } - function appendIndexShard($index_shard) + /** + * Changes the summary offsets associated with a set of doc_ids to new + * values. This is needed because the fetcher puts documents in a + * shard before sending them to a queue_server. 
It is on the queue_server + * however where documents are stored in the IndexArchiveBundle and + * summary offsets are obtained. Thus, the shard needs to be updated at + * that point. + * + * @param array $docid_offsets a set of doc_id offset pairs. + */ + function changeDocumentOffsets($docid_offsets) { + $docids_len = $this->docids_len; + + for($i = 0 ; $i < $docids_len; $i += $row_len) { + $row_len = 16; + $id = substr($this->doc_infos, $i, 8); + $tmp = unpack("N", substr($this->doc_infos, $i + 8, 4)); + $offset = $tmp[1]; + if($offset == 0x7FFFFFFF) {continue; }//ignore duplicates + $comp_flag = 0; + if(($offset & self::COMPOSITE_ID_FLAG) !== 0) { + //handle link item case + $row_len += 16; + $comp_flag = self::COMPOSITE_ID_FLAG; + $id .= ":".substr($this->doc_infos, $i + 12, 8) . ":" . + substr($this->doc_infos, $i + 20, 8); + } + $new_offset = (isset($docid_offsets[$id])) ? + pack("N", ($docid_offsets[$id] | $comp_flag)) : + pack("N", $offset); + + charCopy($new_offset, $this->doc_infos, $i + 8, 4); + } } - function docCount() + /** + * Marks a set of urls as duplicates of urls previously seen. + * To do this the url's doc_id is associated with a summary + * offset of value 0x7FFFFFFF, and its length is set to + * 0XFFFFFFFF + * + * @param array $doc_urls urls to mark as duplicates. + */ + function markDuplicateDocs($doc_urls) { - return ($this->count_doc256 >> 8); + foreach($doc_urls as $duplicate) { + $doc_key = crawlHash($duplicate, true); + $this->doc_infos .= $doc_key . pack("N", 0x7FFFFFFF). 
+ pack("N", 0xFFFFFFFF); + $word_key = crawlHash("info:".$duplicate, true); + $this->word_docs .= pack("N", ($this->docids_len<< 4)).pack("N",0); + $tmp = pack("N", $this->word_docs_len); + $this->words[$word_key] = $tmp.$tmp.pack("N", 0x7FFFFFFF); + $this->word_docs_len += 8; + $this->docids_len += 16; + } + } } diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index 9d2846fe7..1dc3ad5f8 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -80,7 +80,7 @@ class HtmlProcessor extends TextProcessor if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) == 0 && count($summary[self::LINKS]) == 0) { //maybe not html? treat as text still try to get urls - $summary = parent::process($page, url); + $summary = parent::process($page, $url); } } } diff --git a/lib/string_array.php b/lib/string_array.php index 6d830f865..8c34fb028 100755 --- a/lib/string_array.php +++ b/lib/string_array.php @@ -38,6 +38,11 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} */ require_once "persistent_structure.php"; +/** + * Load charCopy + */ +require_once "utility.php"; + /** * Memory efficient implementation of persistent arrays * @@ -159,12 +164,10 @@ class StringArray extends PersistentStructure $data_size = $this->data_size; $start = $i * $data_size; - $end = $start + $data_size; - for($j = $start, $k = 0; $j < $end; $j++, $k++) { - $this->string_array[$j] = $data[$k]; - } + charCopy($data, $this->string_array, $start, $data_size); + } - + } ?> diff --git a/lib/utility.php b/lib/utility.php index 376f3cc08..1694054be 100755 --- a/lib/utility.php +++ b/lib/utility.php @@ -22,7 +22,7 @@ * * END LICENSE * - * A library of log, hash, and time functions + * A library of string, log, hash, and time functions * * @author Chris Pollett chris@pollett.org * @package seek_quarry @@ -35,6 +35,46 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} +/** + * + */ +function charCopy($source, &$destination, $start, 
$length) +{ + $end = $start + $length; + for($j = $start, $k = 0; $j < $end; $j++, $k++) { + $destination[$j] = $source[$k]; + } +} + +/** + * + */ +function vByteEncode($pos_int) +{ + $result = chr($pos_int & 127); + $pos_int >>= 7; + while($pos_int > 0){ + $result .= chr(128 | ($pos_int & 127)); + $pos_int >>= 7; + } + return $result; +} + +/** + * + */ +function vByteDecode(&$str, &$offset) +{ + $pos_int = ord($str[$offset] & 127) ; + $shift = 7; + while (ord($str[$offset++]) & 128 > 0) { + $pos_int += (ord($str[$offset] & 127) << $shift); + $shift += 7; + } + + return $pos_int; +} + /** * Logs a message to a logfile or the screen * diff --git a/models/phrase_model.php b/models/phrase_model.php index b92584645..a604642e5 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -57,7 +57,7 @@ foreach(glob(BASE_DIR."/lib/index_bundle_iterators/*_iterator.php") /** * * This is class is used to handle - * db results for a given phrase search + * results for a given phrase search * * @author Chris Pollett * @package seek_quarry @@ -151,7 +151,7 @@ class PhraseModel extends Model $index_archive = new IndexArchiveBundle( CRAWL_DIR.'/cache/'.$index_archive_name); $word_iterator = - new WordIterator(crawlHash("info:$url"), $index_archive, 0); + new WordIterator(crawlHash("info:$url"), $index_archive); $num_retrieved = 0; $pages = array(); $summary_offset = NULL; @@ -262,6 +262,7 @@ class PhraseModel extends Model $hashes = array_unique($hashes); $restrict_phrases = array_unique($restrict_phrases); $restrict_phrases = array_filter($restrict_phrases); + $index_archive->setCurrentShard(0); $words_array = $index_archive->getSelectiveWords($hashes, 10); if(is_array($words_array)) { @@ -348,12 +349,56 @@ class PhraseModel extends Model * INDEX_ARCHIVE -- an index_archive object to get results from * @param int $limit number of first document in order to return * @param int $num number of documents to return summaries of - * @param object $index_archive index 
archive to use to get summaries from * @return array document summaries */ function getSummariesByHash($word_structs, $limit, $num) { + $pages = array(); + $generation = 0; + $to_retrieve = $limit + max(2*$num, 200); + $num_retrieved = 0; + while($num_retrieved < $to_retrieve) { + $gen_pages = $this->getGenerationSummariesByHash( + $word_structs, $to_retrieve, $generation); + if(!is_array($gen_pages)) { break; } + $num_retrieved += count($gen_pages); + $pages += $gen_pages; + $generation++; + } + uasort($pages, "scoreOrderCallback"); + $pages = array_slice($pages, $limit, $num); + if($num_retrieved < $to_retrieve ) { + $results['TOTAL_ROWS'] = $num_retrieved; + } else { + $results['TOTAL_ROWS'] = + $num_retrieved; + //num_docs is only approximate, so if gives contradictory info + //use $num_retrieved + } + $results['PAGES'] = & $pages; + return $results; + } + + + /** + * Gets doc summaries of documents containing given words and meeting the + * additional provided criteria in a given index shard generation + * @param array $word_structs an array of word_structs. 
Here a word_struct + * is an associative array with at least the following fields + * KEYS -- an array of word keys + * RESTRICT_PHRASES -- an array of phrases the document must contain + * DISALLOW_PHRASES -- an array of words the document must not contain + * WEIGHT -- a weight to multiple scores returned from this iterator by + * INDEX_ARCHIVE -- an index_archive object to get results from + * @param int $num number of documents to return summaries of + * @param int $generation the index of the generation to get summaries from + * @return array document summaries + */ + function getGenerationSummariesByHash($word_structs, + $num, $generation) + { + $iterators = array(); foreach($word_structs as $word_struct) { if(!is_array($word_struct)) { continue;} @@ -361,25 +406,29 @@ class PhraseModel extends Model $restrict_phrases = $word_struct["RESTRICT_PHRASES"]; $disallow_phrases = $word_struct["DISALLOW_PHRASES"]; $index_archive = $word_struct["INDEX_ARCHIVE"]; + if($generation > $index_archive->generation_info['ACTIVE']) { + continue; + } + $index_archive->setCurrentShard($generation); $weight = $word_struct["WEIGHT"]; $num_word_keys = count($word_keys); if($num_word_keys < 1) {continue;} for($i = 0; $i < $num_word_keys; $i++) { $word_iterators[$i] = - new WordIterator($word_keys[$i], $index_archive, 0); + new WordIterator($word_keys[$i], $index_archive); } if($num_word_keys == 1) { $base_iterator = $word_iterators[0]; } else { - $base_iterator = new IntersectIterator($word_iterators, 0); + $base_iterator = new IntersectIterator($word_iterators); } if($restrict_phrases == NULL && $disallow_phrases == NULL && $weight == 1) { $iterators[] = $base_iterator; } else { $iterators[] = new PhraseFilterIterator($base_iterator, - $restrict_phrases, $disallow_phrases, $weight, 0); + $restrict_phrases, $disallow_phrases, $weight); } } @@ -389,38 +438,26 @@ class PhraseModel extends Model } else if($num_iterators == 1) { $union_iterator = $iterators[0]; } else { - 
$union_iterator = new UnionIterator($iterators, 0); + $union_iterator = new UnionIterator($iterators); } - $to_retrieve = $limit + max(2*$num, 200); - $group_iterator = new GroupIterator($union_iterator, 0); + $to_retrieve = max(2*$num, 200); + $group_iterator = new GroupIterator($union_iterator); $num_retrieved = 0; $pages = array(); while(is_array($next_docs = $group_iterator->nextDocsWithWord()) && - $num_retrieved < $to_retrieve) { + $num_retrieved < $num) { foreach($next_docs as $doc_key => $doc_info) { $summary = & $doc_info[CrawlConstants::SUMMARY]; unset($doc_info[CrawlConstants::SUMMARY]); $pages[] = array_merge($doc_info, $summary); $num_retrieved++; if($num_retrieved >= $to_retrieve) { - break 2; } } } - uasort($pages, "scoreOrderCallback"); - $pages = array_slice($pages, $limit, $num); - if($num_retrieved < $to_retrieve && $limit<=$group_iterator->num_docs) { - $results['TOTAL_ROWS'] = $num_retrieved; - } else { - $results['TOTAL_ROWS'] = max($group_iterator->num_docs, - $num_retrieved); - /*num_docs is only approximate, so if gives contradictory info - use $num_retrieved */ - } - $results['PAGES'] = $pages; - return $results; + return $pages; } } diff --git a/views/search_view.php b/views/search_view.php index 71cadab4b..b0f5b1472 100755 --- a/views/search_view.php +++ b/views/search_view.php @@ -84,6 +84,8 @@ class SearchView extends View implements CrawlConstants <div class="searchbox"> <form id="searchForm" method="get" action=''> <p> + <input type="hidden" name="YIOOP_TOKEN" value="<?php + e($data['YIOOP_TOKEN']); ?>" /> <input type="hidden" name="its" value="<?php e($data['its']); ?>" /> <input type="text" title="<?php e(tl('search_view_input_label')); ?>" id="search-name" name="q" value="<?php if(isset($data['QUERY'])) {