diff --git a/bin/fetcher.php b/bin/fetcher.php index 544180d27..b39049612 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -36,7 +36,7 @@ define("BASE_DIR", substr( dirname(realpath($_SERVER['PHP_SELF'])), 0, -strlen("/bin"))); -ini_set("memory_limit","700M"); //so have enough memory to crawl big pages +ini_set("memory_limit","750M"); //so have enough memory to crawl big pages /** Load in global configuration settings */ require_once BASE_DIR.'/configs/config.php'; diff --git a/configs/config.php b/configs/config.php index baed0fd16..3d976f46b 100755 --- a/configs/config.php +++ b/configs/config.php @@ -43,7 +43,7 @@ date_default_timezone_set('America/Los_Angeles'); /*+++ The next block of code is machine edited, change at your own risk, please use configure web page instead +++*/ -define('WORK_DIRECTORY', ''); +define('WORK_DIRECTORY', '/Applications/xampp/xamppfiles/htdocs/crawls'); /*++++++*/ if(file_exists(WORK_DIRECTORY."/profile.php")) { diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php index 93b49211c..2d2e30bbc 100644 --- a/lib/index_bundle_iterators/group_iterator.php +++ b/lib/index_bundle_iterators/group_iterator.php @@ -167,7 +167,6 @@ class GroupIterator extends IndexBundleIterator } } } else { - $pre_out_pages[$hash_url][] = $doc_info; if($doc_info['IS_PAGE'] == true) { $pre_out_pages[$hash_url]['IS_PAGE'] = true; @@ -190,9 +189,10 @@ class GroupIterator extends IndexBundleIterator if(is_array($doc_array) && count($doc_array) == 1) { $keys = array_keys($doc_array); $key = $keys[0]; - if(!isset($doc_array[$key][self::DUPLICATE]) ) { + if(!isset($doc_array[$key][self::DUPLICATE]) ) {; $pre_out_pages[$hash_url][$key] = $doc_array[$key]; $pre_out_pages[$hash_url][$key]['IS_PAGE'] = true; + $pre_out_pages[$hash_url][$key]['KEY'] = $key; } else { /* Deduplication: @@ -218,7 +218,7 @@ class GroupIterator extends IndexBundleIterator foreach($group_infos as $doc_info) { $is_page = $doc_info['IS_PAGE']; unset($doc_info['IS_PAGE']); - if(!isset($out_pages[$hash_url])) { + if(!isset($out_pages[$hash_url]) || $is_page) { $out_pages[$hash_url] = $doc_info; $out_pages[$hash_url][self::SUMMARY_OFFSET] = array(); if(isset($doc_info[self::SUMMARY_OFFSET]) ) { diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php index 62f36b691..31f18f3a8 100644 --- a/lib/index_bundle_iterators/intersect_iterator.php +++ b/lib/index_bundle_iterators/intersect_iterator.php @@ -97,7 +97,7 @@ class IntersectIterator extends IndexBundleIterator $this->index_bundle_iterators = $index_bundle_iterators; $this->num_iterators = count($index_bundle_iterators); - $this->num_docs = -1; + $this->num_docs = 0; $this->results_per_block = 1; /* @@ -107,10 +107,7 @@ class IntersectIterator extends IndexBundleIterator iterator */ for($i = 0; $i < $this->num_iterators; $i++) { - if( $this->num_docs < 0 || - $this->index_bundle_iterators[$i]->num_docs < $this->num_docs) { - $this->num_docs = $this->index_bundle_iterators[$i]->num_docs; - } + $this->num_docs += $this->index_bundle_iterators[$i]->num_docs; $this->index_bundle_iterators[$i]->setResultsPerBlock(1); } $this->reset(); @@ -157,19 +154,20 @@ class IntersectIterator extends IndexBundleIterator function syncDocOffsetsAmongstIterators() { $biggest_offset = 0; - $all_same = true; do{ + $all_same = true; for($i = 0; $i < $this->num_iterators; $i++) { - $new_doc_offset = - $this->index_bundle_iterators[$i]->currentDocOffsetWithWord(); + $new_doc_offset[$i] = + $this->index_bundle_iterators[ + $i]->currentDocOffsetWithWord(); if($i == 0) { - $biggest_offset = $new_doc_offset; + $biggest_offset = $new_doc_offset[$i]; } - if($new_doc_offset == -1) { + if($new_doc_offset[$i] == -1) { return -1; } - if($new_doc_offset > $biggest_offset) { - $biggest_offset = $new_doc_offset; + if($new_doc_offset[$i] > $biggest_offset) { + $biggest_offset = $new_doc_offset[$i]; $all_same = false; } } @@ -177,7 +175,9 @@ class IntersectIterator extends IndexBundleIterator return 1; } for($i = 0; $i < $this->num_iterators; $i++) { - $this->index_bundle_iterators[$i]->advance($biggest_offset); + if($new_doc_offset[$i] < $biggest_offset) { + $this->index_bundle_iterators[$i]->advance($biggest_offset); + } } } while(!$all_same); } @@ -198,15 +198,13 @@ class IntersectIterator extends IndexBundleIterator for($i = 0; $i < $this->num_iterators; $i++) { $this->seen_docs_unfiltered += $this->index_bundle_iterators[$i]->seen_docs; - $total_num_docs = $this->index_bundle_iterators[$i]->num_docs; + $total_num_docs += $this->index_bundle_iterators[$i]->num_docs; } if($this->seen_docs_unfiltered > 0) { $this->num_docs = floor(($this->seen_docs * $total_num_docs) / $this->seen_docs_unfiltered); - } else { - $this->num_docs = 0; - } + } $this->index_bundle_iterators[0]->advance($doc_offset); diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php index 42db86f03..25e4846c7 100644 --- a/lib/index_bundle_iterators/word_iterator.php +++ b/lib/index_bundle_iterators/word_iterator.php @@ -171,6 +171,7 @@ class WordIterator extends IndexBundleIterator return -1; } $this->next_offset = $this->current_offset; + //the next call also updates next offset $results = $this->index->getCurrentShard()->getPostingsSlice( $this->next_offset, $this->last_offset, $this->results_per_block); return $results; @@ -192,6 +193,10 @@ class WordIterator extends IndexBundleIterator $this->index->getCurrentShard( )->nextPostingOffsetDocOffset($this->next_offset, $this->last_offset, $doc_offset); + if($this->current_offset === false) { + $this->current_offset = $this->last_offset + 1; + return; + } $this->seen_docs = ($this->current_offset - $this->start_offset)/ IndexShard::POSTING_LEN; diff --git a/lib/index_shard.php b/lib/index_shard.php index 188fb45a1..09b9c2f2e 100644 --- a/lib/index_shard.php +++ b/lib/index_shard.php @@ -356,13 +356,14 @@ class IndexShard extends PersistentStructure implements CrawlConstants ($last_offset - $next_offset) >> 2 : 1; $results = array(); + $end = min($this->word_docs_len, $last_offset); do { - if($next_offset >= $this->word_docs_len) {break;} + if($next_offset > $end) {break;} $item = array(); $posting = $this->getWordDocsSubstring($next_offset, 4); list($doc_index, $occurrences) = $this->unpackPosting($posting); $old_next_offset = $next_offset; - $next_offset += 4; + $next_offset += self::POSTING_LEN; $doc_depth = log(10*(($doc_index +1) + $this->generation_offset)*NUM_FETCHERS, 10); $item[self::DOC_RANK] = number_format(11 - @@ -370,8 +371,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants $doc_loc = $doc_index << 4; $doc_info_string = $this->getDocInfoSubstring($doc_loc, 12); $doc_id = substr($doc_info_string, 0, 8); - $tmp = unpack("N", substr($doc_info_string, 8, 4)); - $item[self::SUMMARY_OFFSET] = $tmp[1]; + $item[self::SUMMARY_OFFSET] = $this->unpackInt( + substr($doc_info_string, 8, 4)); $is_doc = false; $skip_stats = false; @@ -379,7 +380,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants $item[self::SUMMARY_OFFSET] == self::NEEDS_OFFSET_FLAG) { $skip_stats = true; $item[self::DUPLICATE] = true; - } else if(($tmp[1] & self::COMPOSITE_ID_FLAG) !== 0) { + } else if(($item[self::SUMMARY_OFFSET] + & self::COMPOSITE_ID_FLAG) !== 0) { //handles link item case $item[self::SUMMARY_OFFSET] ^= self::COMPOSITE_ID_FLAG; $doc_loc += 12; @@ -410,8 +412,9 @@ class IndexShard extends PersistentStructure implements CrawlConstants $IDF = ($num_docs - $num_term_occurrences + 0.5) / ($num_term_occurrences + 0.5); $item[self::RELEVANCE] = $IDF * $pre_relevance; + $item[self::SCORE] = $item[self::DOC_RANK] + - 0.1*$item[self::RELEVANCE]; + .1/ ($item[self::RELEVANCE] + .1); } $results[$doc_id] = $item; $num_docs_so_far ++; @@ -869,7 +872,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants do { $data = $this->readBlockShardAtOffset($block_offset); - if($data == false) {return $substring;} + if($data === false) {return $substring;} $block_offset += self::SHARD_BLOCK_SIZE; $substring .= substr($data, $start_loc); $start_loc = 0;