diff --git a/controllers/search_controller.php b/controllers/search_controller.php index 04e0f5bcb..a65ffccf5 100755 --- a/controllers/search_controller.php +++ b/controllers/search_controller.php @@ -592,9 +592,11 @@ class SearchController extends Controller implements CrawlConstants * no grouping done on data. If $raw == 1 no summary returned (used * with f=serial, end user probably does not want) * In this case, will get offset, generation, etc so could later lookup - * @param int $save_timestamp if this timestamp is nonzero, then save + * @param mixed $save_timestamp if this timestamp is nonzero, then save * iterate position, so can resume on future queries that make - * use of the timestamp + * use of the timestamp. $save_timestamp may also be in the format + * of string timestamp-query_part to handle networked queries involving + * presentations */ function processQuery(&$data, $query, $activity, $arg, $results_per_page, $limit = 0, $index_name = 0, $raw = 0, $save_timestamp = 0) diff --git a/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php b/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php index 9c9db61ed..d3c0d4d35 100644 --- a/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php +++ b/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php @@ -193,9 +193,8 @@ class MixArchiveBundleIterator extends ArchiveBundleIterator $objects = $results["PAGES"]; $this->limit += $num_results; $objects["NO_PROCESS"] = true; - if($num_results < $num - 1) { - $this->end_of_iterator = true; - } + } else if ($num_results == 0) { + $this->end_of_iterator = true; } else { $objects = array("NO_PROCESS" => $results); } diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php index ac747d488..1f56cba17 100644 --- a/lib/index_bundle_iterators/intersect_iterator.php +++ b/lib/index_bundle_iterators/intersect_iterator.php @@ -127,10 +127,19 @@ class IntersectIterator extends 
IndexBundleIterator up here that we return at most one posting at a time from each iterator */ + $this->seen_docs = 0; + $this->seen_docs_unfiltered = 0; for($i = 0; $i < $this->num_iterators; $i++) { $this->num_docs += $this->index_bundle_iterators[$i]->num_docs; + $this->index_bundle_iterators[$i]->setResultsPerBlock(1); + $this->seen_docs += $this->index_bundle_iterators[$i]->seen_docs; + if(isset($this->index_bundle_iterators[$i]->seen_docs_unfiltered)) { + $this->seen_docs_unfiltered += + $this->index_bundle_iterators[$i]->seen_docs_unfiltered; + } else { + $this->seen_docs_unfiltered += $this->seen_docs; + } } - $this->reset(); } /** @@ -522,4 +531,4 @@ class IntersectIterator extends IndexBundleIterator } } } -?> \ No newline at end of file +?> diff --git a/lib/index_bundle_iterators/network_iterator.php b/lib/index_bundle_iterators/network_iterator.php index 91f175411..2686d8137 100644 --- a/lib/index_bundle_iterators/network_iterator.php +++ b/lib/index_bundle_iterators/network_iterator.php @@ -119,18 +119,20 @@ class NetworkIterator extends IndexBundleIterator * archive bundles that we look in for results * @param array $filter an array of hashes of domains to filter from * results - * @param int $save_timestamp if this timestamp is nonzero, then when making - * queries to separate machines the save_timestamp is sent so - * the queries on those machine can make savepoints + * @param string $save_timestamp_name if this timestamp is nonzero, then when + * making queries to separate machines the save_timestamp is sent so + * the queries on those machines can make savepoints. Note the + * format of save_timestamp is timestamp-query_part where query_part + * is the number of the item in a query presentation (usually 0). */ function __construct($query, $queue_servers, $timestamp, &$filter = NULL, - $save_timestamp = 0) + $save_timestamp_name = "") { $this->results_per_block = ceil(self::MIN_FIND_RESULTS_PER_BLOCK); $this->base_query = "q=".urlencode($query). 
"&f=serial&network=false&raw=1&its=$timestamp&guess=false"; - if($save_timestamp > 0) { // used for archive crawls of crawl mixes - $this->base_query .= "&save_timestamp=$save_timestamp"; + if($save_timestamp_name != "") { // used for archive crawls of crawl mixes + $this->base_query .= "&save_timestamp=$save_timestamp_name"; } $this->queue_servers = $queue_servers; $this->limit = 0; diff --git a/lib/index_bundle_iterators/union_iterator.php b/lib/index_bundle_iterators/union_iterator.php index 23260ffe4..797f36bd7 100644 --- a/lib/index_bundle_iterators/union_iterator.php +++ b/lib/index_bundle_iterators/union_iterator.php @@ -102,6 +102,8 @@ class UnionIterator extends IndexBundleIterator $this->num_docs = 0; $this->results_per_block = 0; $this->key_iterator_table = array(); + $this->seen_docs = 0; + $this->seen_docs_unfiltered = 0; for($i = 0; $i < $this->num_iterators; $i++) { $this->num_docs += $this->index_bundle_iterators[$i]->num_docs; /* @@ -111,8 +113,16 @@ class UnionIterator extends IndexBundleIterator */ $this->results_per_block += $this->index_bundle_iterators[$i]->results_per_block; + $this->seen_docs += $this->index_bundle_iterators[$i]->seen_docs; + if(isset($this->index_bundle_iterators[$i]->seen_docs_unfiltered)) { + $this->seen_docs_unfiltered += + $this->index_bundle_iterators[$i]->seen_docs_unfiltered; + } else { + $this->seen_docs_unfiltered += $this->seen_docs; + } } - $this->reset(); + + $doc_block = $this->currentDocsWithWord(); } /** diff --git a/models/parallel_model.php b/models/parallel_model.php index 919ae6de7..df69257ab 100755 --- a/models/parallel_model.php +++ b/models/parallel_model.php @@ -448,9 +448,11 @@ class ParallelModel extends Model implements CrawlConstants return; } - $save_file = CRAWL_DIR.'/cache/savepoint'. - $save_timestamp.".txt"; - @unlink($save_file); + $save_files = glob(CRAWL_DIR.'/cache/Savepoint'. 
+ $save_timestamp."*.txt"); + foreach($save_files as $save_file) { + @unlink($save_file); + } } /** diff --git a/models/phrase_model.php b/models/phrase_model.php index 06da6a4fc..ad57ed8d7 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -282,6 +282,13 @@ class PhraseModel extends ParallelModel } $qpart = 0; + if(is_string($save_timestamp)) { + $save_parts = explode("-",$save_timestamp); + if(isset($save_parts[1])) { + $qpart = intval($save_parts[1]); + $save_timestamp = intval($save_parts[0]); + } + } $orig_stimestamp = $save_timestamp; foreach($query_parts as $phrase => $pre_result_bounds) { @@ -342,10 +349,14 @@ class PhraseModel extends ParallelModel changeInMicrotime($start_time)."<br />"; } - $save_timestamp = intval("$orig_stimestamp$qpart"); + if($orig_stimestamp > 0) { + $save_timestamp_name = "$orig_stimestamp-$qpart"; + } else { + $save_timestamp_name = ""; + } $out_results = $this->getSummariesByHash($word_structs, $low, $phrase_num, $filter, $use_cache_if_allowed, $raw, - $queue_servers, $disjunct, $save_timestamp); + $queue_servers, $disjunct, $save_timestamp_name); if(isset($out_results['PAGES']) && count($out_results['PAGES']) != 0) { $out_count = 0; @@ -888,15 +899,15 @@ class PhraseModel extends ParallelModel * be used during lookup * @param string $original_query if set, the original query that corresponds * to $word_structs - * @param int $save_timestamp if this timestamp is nonzero, then save - * iterate position, so can resume on future queries that make + * @param string $save_timestamp_name if this timestamp is not empty, then + * save iterate position, so can resume on future queries that make * use of the timestamp. If used then $limit ignored and get next $num * docs after $save_timestamp 's previous iterate position. 
* @return array document summaries */ function getSummariesByHash($word_structs, $limit, $num, &$filter, $use_cache_if_allowed = true, $raw = 0, $queue_servers = array(), - $original_query = "", $save_timestamp = 0) + $original_query = "", $save_timestamp_name = "") { global $CACHE; $indent= " "; @@ -917,12 +928,12 @@ class PhraseModel extends ParallelModel self::NUM_CACHE_PAGES; $start_slice = floor(($limit)/self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES; - if($save_timestamp > 0) { + if($save_timestamp_name != "") { $to_retrieve = $num; $limit = 0; $start_slice = 0; } - if(USE_CACHE && $save_timestamp == 0) { + if(USE_CACHE && $save_timestamp_name == "") { $mem_tmp = serialize($raw).serialize($word_structs); $summary_hash = crawlHash($mem_tmp.":".$limit.":".$num); @@ -940,7 +951,7 @@ class PhraseModel extends ParallelModel } } $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw, - $queue_servers, $original_query, $save_timestamp); + $queue_servers, $original_query, $save_timestamp_name); $num_retrieved = 0; $pages = array(); @@ -952,11 +963,11 @@ class PhraseModel extends ParallelModel $num_retrieved += count($next_docs); } } - if($save_timestamp > 0 && ($queue_servers == array() || + if($save_timestamp_name != "" && ($queue_servers == array() || $this->isSingleLocalhost($queue_servers))) { // used for archive crawls of crawl mixes $save_file = CRAWL_DIR.'/cache/'.self::save_point. 
- $save_timestamp.".txt"; + $save_timestamp_name.".txt"; $iterators = $query_iterator->save_iterators; $cnt_iterators = count($iterators); $save_point = array(); @@ -1020,8 +1031,10 @@ class PhraseModel extends ParallelModel //this is only an approximation } - $pages = array_slice($pages, $start_slice); - $pages = array_slice($pages, $limit - $start_slice, $num); + if($save_timestamp_name == "") { + $pages = array_slice($pages, $start_slice); + $pages = array_slice($pages, $limit - $start_slice, $num); + } if($raw == 1) { $results['PAGES'] = & $pages; @@ -1090,7 +1103,7 @@ class PhraseModel extends ParallelModel $format_time = microtime(); } $results['PAGES'] = & $out_pages; - if(USE_CACHE && $save_timestamp == 0) { + if(USE_CACHE && $save_timestamp_name == "") { $CACHE->set($summary_hash, $results); } return $results; @@ -1212,15 +1225,15 @@ class PhraseModel extends ParallelModel * be used during lookup * @param string $original_query if set, the orginal query that corresponds * to $word_structs - * @param int $save_timestamp if this timestamp is nonzero, then when making - * iterator get sub-iterators to advance to gen doc_offset stored - * with respect to save_timestamp if exists. + * @param string $save_timestamp_name if this timestamp is non empty, then + * when making iterator get sub-iterators to advance to gen doc_offset + * stored with respect to save_timestamp if exists. 
* * @return &object an iterator for iterating through results to the * query */ function getQueryIterator($word_structs, &$filter, $raw = 0, - $queue_servers = array(), $original_query = "", $save_timestamp = 0) + $queue_servers = array(), $original_query = "", $save_timestamp_name="") { $iterators = array(); $total_iterators = 0; @@ -1237,15 +1250,16 @@ class PhraseModel extends ParallelModel $index_name = $this->index_name; } $iterators[0] = new NetworkIterator($original_query, - $queue_servers, $index_name, $filter, $save_timestamp); + $queue_servers, $index_name, $filter, $save_timestamp_name); } if(!$network_flag) { $doc_iterate_hash = crawlHash("site:any"); $doc_iterate_group_hash = crawlHash("site:doc"); - if($save_timestamp > 0) { // used for archive crawls of crawl mixes + if($save_timestamp_name != "") { + // used for archive crawls of crawl mixes $save_file = CRAWL_DIR.'/cache/'.self::save_point. - $save_timestamp.".txt"; + $save_timestamp_name.".txt"; if(file_exists($save_file)) { $save_point = unserialize(file_get_contents($save_file)); @@ -1302,7 +1316,7 @@ class PhraseModel extends ParallelModel $word_iterators, $word_iterator_map, $quote_positions, $weight); } - if($save_timestamp > 0) { + if($save_timestamp_name != "") { if(isset($save_point[$save_count]) && $save_point[$save_count] != -1) { $base_iterator->advance($save_point[$save_count]); @@ -1336,7 +1350,7 @@ class PhraseModel extends ParallelModel $union_iterator->results_per_block = ceil(SERVER_ALPHA * $group_iterator->results_per_block/$num_servers); - } else if($save_timestamp > 0) { + } else if($save_timestamp_name != "") { $group_iterator->save_iterators = $iterators; }