diff --git a/controllers/search_controller.php b/controllers/search_controller.php index 04e0f5bcb..a65ffccf5 100755 --- a/controllers/search_controller.php +++ b/controllers/search_controller.php @@ -592,9 +592,11 @@ class SearchController extends Controller implements CrawlConstants * no grouping done on data. If $raw == 1 no summary returned (used * with f=serial, end user probably does not want) * In this case, will get offset, generation, etc so could later lookup - * @param int $save_timestamp if this timestamp is nonzero, then save + * @param mixed $save_timestamp if this timestamp is nonzero, then save * iterate position, so can resume on future queries that make - * use of the timestamp + * use of the timestamp. $save_timestamp may also be in the format + * of string timestamp-query_part to handle networked queries involving + * presentations */ function processQuery(&$data, $query, $activity, $arg, $results_per_page, $limit = 0, $index_name = 0, $raw = 0, $save_timestamp = 0) diff --git a/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php b/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php index 9c9db61ed..d3c0d4d35 100644 --- a/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php +++ b/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php @@ -193,9 +193,8 @@ class MixArchiveBundleIterator extends ArchiveBundleIterator $objects = $results["PAGES"]; $this->limit += $num_results; $objects["NO_PROCESS"] = true; - if($num_results < $num - 1) { - $this->end_of_iterator = true; - } + } else if ($num_results == 0) { + $this->end_of_iterator = true; } else { $objects = array("NO_PROCESS" => $results); } diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php index ac747d488..1f56cba17 100644 --- a/lib/index_bundle_iterators/intersect_iterator.php +++ b/lib/index_bundle_iterators/intersect_iterator.php @@ -127,10 +127,19 @@ class IntersectIterator extends 
IndexBundleIterator up here that we return at most one posting at a time from each iterator */ + $this->seen_docs = 0; + $this->seen_docs_unfiltered = 0; for($i = 0; $i < $this->num_iterators; $i++) { $this->num_docs += $this->index_bundle_iterators[$i]->num_docs; + $this->index_bundle_iterators[$i]->setResultsPerBlock(1); + $this->seen_docs += $this->index_bundle_iterators[$i]->seen_docs; + if(isset($this->index_bundle_iterators[$i]->seen_docs_unfiltered)) { + $this->seen_docs_unfiltered += + $this->index_bundle_iterators[$i]->seen_docs_unfiltered; + } else { + $this->seen_docs_unfiltered += $this->seen_docs; + } } - $this->reset(); } /** @@ -522,4 +531,4 @@ class IntersectIterator extends IndexBundleIterator } } } -?> \ No newline at end of file +?> diff --git a/lib/index_bundle_iterators/network_iterator.php b/lib/index_bundle_iterators/network_iterator.php index 91f175411..2686d8137 100644 --- a/lib/index_bundle_iterators/network_iterator.php +++ b/lib/index_bundle_iterators/network_iterator.php @@ -119,18 +119,20 @@ class NetworkIterator extends IndexBundleIterator * archive bundles that we look in for results * @param array $filter an array of hashes of domains to filter from * results - * @param int $save_timestamp if this timestamp is nonzero, then when making - * queries to separate machines the save_timestamp is sent so - * the queries on those machine can make savepoints + * @param string $save_timestamp_name if this timestamp is nonzero, then when + * making queries to separate machines the save_timestamp is sent so + * the queries on those machines can make savepoints. Note the + * format of save_timestamp is timestamp-query_part where query_part + * is the number of the item in a query presentation (usually 0). */ function __construct($query, $queue_servers, $timestamp, &$filter = NULL, - $save_timestamp = 0) + $save_timestamp_name = "") { $this->results_per_block = ceil(self::MIN_FIND_RESULTS_PER_BLOCK); $this->base_query = "q=".urlencode($query). 
"&f=serial&network=false&raw=1&its=$timestamp&guess=false"; - if($save_timestamp > 0) { // used for archive crawls of crawl mixes - $this->base_query .= "&save_timestamp=$save_timestamp"; + if($save_timestamp_name != "") { // used for archive crawls of crawl mixes + $this->base_query .= "&save_timestamp=$save_timestamp_name"; } $this->queue_servers = $queue_servers; $this->limit = 0; diff --git a/lib/index_bundle_iterators/union_iterator.php b/lib/index_bundle_iterators/union_iterator.php index 23260ffe4..797f36bd7 100644 --- a/lib/index_bundle_iterators/union_iterator.php +++ b/lib/index_bundle_iterators/union_iterator.php @@ -102,6 +102,8 @@ class UnionIterator extends IndexBundleIterator $this->num_docs = 0; $this->results_per_block = 0; $this->key_iterator_table = array(); + $this->seen_docs = 0; + $this->seen_docs_unfiltered = 0; for($i = 0; $i < $this->num_iterators; $i++) { $this->num_docs += $this->index_bundle_iterators[$i]->num_docs; /* @@ -111,8 +113,16 @@ class UnionIterator extends IndexBundleIterator */ $this->results_per_block += $this->index_bundle_iterators[$i]->results_per_block; + $this->seen_docs += $this->index_bundle_iterators[$i]->seen_docs; + if(isset($this->index_bundle_iterators[$i]->seen_docs_unfiltered)) { + $this->seen_docs_unfiltered += + $this->index_bundle_iterators[$i]->seen_docs_unfiltered; + } else { + $this->seen_docs_unfiltered += $this->seen_docs; + } } - $this->reset(); + + $doc_block = $this->currentDocsWithWord(); } /** diff --git a/models/parallel_model.php b/models/parallel_model.php index 919ae6de7..df69257ab 100755 --- a/models/parallel_model.php +++ b/models/parallel_model.php @@ -448,9 +448,11 @@ class ParallelModel extends Model implements CrawlConstants return; } - $save_file = CRAWL_DIR.'/cache/savepoint'. - $save_timestamp.".txt"; - @unlink($save_file); + $save_files = glob(CRAWL_DIR.'/cache/Savepoint'. 
+ $save_timestamp."*.txt"); + foreach($save_files as $save_file) { + @unlink($save_file); + } } /** diff --git a/models/phrase_model.php b/models/phrase_model.php index 06da6a4fc..ad57ed8d7 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -282,6 +282,13 @@ class PhraseModel extends ParallelModel } $qpart = 0; + if(is_string($save_timestamp)) { + $save_parts = explode("-",$save_timestamp); + if(isset($save_parts[1])) { + $qpart = intval($save_parts[1]); + $save_timestamp = intval($save_parts[0]); + } + } $orig_stimestamp = $save_timestamp; foreach($query_parts as $phrase => $pre_result_bounds) { @@ -342,10 +349,14 @@ class PhraseModel extends ParallelModel changeInMicrotime($start_time)."<br />"; } - $save_timestamp = intval("$orig_stimestamp$qpart"); + if($orig_stimestamp > 0) { + $save_timestamp_name = "$orig_stimestamp-$qpart"; + } else { + $save_timestamp_name = ""; + } $out_results = $this->getSummariesByHash($word_structs, $low, $phrase_num, $filter, $use_cache_if_allowed, $raw, - $queue_servers, $disjunct, $save_timestamp); + $queue_servers, $disjunct, $save_timestamp_name); if(isset($out_results['PAGES']) && count($out_results['PAGES']) != 0) { $out_count = 0; @@ -888,15 +899,15 @@ class PhraseModel extends ParallelModel * be used during lookup * @param string $original_query if set, the original query that corresponds * to $word_structs - * @param int $save_timestamp if this timestamp is nonzero, then save - * iterate position, so can resume on future queries that make + * @param string $save_timestamp_name if this timestamp is not empty, then + * save iterate position, so can resume on future queries that make * use of the timestamp. If used then $limit ignored and get next $num * docs after $save_timestamp 's previous iterate position. 
* @return array document summaries */ function getSummariesByHash($word_structs, $limit, $num, &$filter, $use_cache_if_allowed = true, $raw = 0, $queue_servers = array(), - $original_query = "", $save_timestamp = 0) + $original_query = "", $save_timestamp_name = "") { global $CACHE; $indent= " "; @@ -917,12 +928,12 @@ class PhraseModel extends ParallelModel self::NUM_CACHE_PAGES; $start_slice = floor(($limit)/self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES; - if($save_timestamp > 0) { + if($save_timestamp_name != "") { $to_retrieve = $num; $limit = 0; $start_slice = 0; } - if(USE_CACHE && $save_timestamp == 0) { + if(USE_CACHE && $save_timestamp_name == "") { $mem_tmp = serialize($raw).serialize($word_structs); $summary_hash = crawlHash($mem_tmp.":".$limit.":".$num); @@ -940,7 +951,7 @@ class PhraseModel extends ParallelModel } } $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw, - $queue_servers, $original_query, $save_timestamp); + $queue_servers, $original_query, $save_timestamp_name); $num_retrieved = 0; $pages = array(); @@ -952,11 +963,11 @@ class PhraseModel extends ParallelModel $num_retrieved += count($next_docs); } } - if($save_timestamp > 0 && ($queue_servers == array() || + if($save_timestamp_name != "" && ($queue_servers == array() || $this->isSingleLocalhost($queue_servers))) { // used for archive crawls of crawl mixes $save_file = CRAWL_DIR.'/cache/'.self::save_point. 
- $save_timestamp.".txt"; + $save_timestamp_name.".txt"; $iterators = $query_iterator->save_iterators; $cnt_iterators = count($iterators); $save_point = array(); @@ -1020,8 +1031,10 @@ class PhraseModel extends ParallelModel //this is only an approximation } - $pages = array_slice($pages, $start_slice); - $pages = array_slice($pages, $limit - $start_slice, $num); + if($save_timestamp_name == "") { + $pages = array_slice($pages, $start_slice); + $pages = array_slice($pages, $limit - $start_slice, $num); + } if($raw == 1) { $results['PAGES'] = & $pages; @@ -1090,7 +1103,7 @@ class PhraseModel extends ParallelModel $format_time = microtime(); } $results['PAGES'] = & $out_pages; - if(USE_CACHE && $save_timestamp == 0) { + if(USE_CACHE && $save_timestamp_name == "") { $CACHE->set($summary_hash, $results); } return $results; @@ -1212,15 +1225,15 @@ class PhraseModel extends ParallelModel * be used during lookup * @param string $original_query if set, the orginal query that corresponds * to $word_structs - * @param int $save_timestamp if this timestamp is nonzero, then when making - * iterator get sub-iterators to advance to gen doc_offset stored - * with respect to save_timestamp if exists. + * @param string $save_timestamp_name if this timestamp is non empty, then + * when making iterator get sub-iterators to advance to gen doc_offset + * stored with respect to save_timestamp if exists. 
* * @return &object an iterator for iterating through results to the * query */ function getQueryIterator($word_structs, &$filter, $raw = 0, - $queue_servers = array(), $original_query = "", $save_timestamp = 0) + $queue_servers = array(), $original_query = "", $save_timestamp_name="") { $iterators = array(); $total_iterators = 0; @@ -1237,15 +1250,16 @@ class PhraseModel extends ParallelModel $index_name = $this->index_name; } $iterators[0] = new NetworkIterator($original_query, - $queue_servers, $index_name, $filter, $save_timestamp); + $queue_servers, $index_name, $filter, $save_timestamp_name); } if(!$network_flag) { $doc_iterate_hash = crawlHash("site:any"); $doc_iterate_group_hash = crawlHash("site:doc"); - if($save_timestamp > 0) { // used for archive crawls of crawl mixes + if($save_timestamp_name != "") { + // used for archive crawls of crawl mixes $save_file = CRAWL_DIR.'/cache/'.self::save_point. - $save_timestamp.".txt"; + $save_timestamp_name.".txt"; if(file_exists($save_file)) { $save_point = unserialize(file_get_contents($save_file)); @@ -1302,7 +1316,7 @@ class PhraseModel extends ParallelModel $word_iterators, $word_iterator_map, $quote_positions, $weight); } - if($save_timestamp > 0) { + if($save_timestamp_name != "") { if(isset($save_point[$save_count]) && $save_point[$save_count] != -1) { $base_iterator->advance($save_point[$save_count]); @@ -1336,7 +1350,7 @@ class PhraseModel extends ParallelModel $union_iterator->results_per_block = ceil(SERVER_ALPHA * $group_iterator->results_per_block/$num_servers); - } else if($save_timestamp > 0) { + } else if($save_timestamp_name != "") { $group_iterator->save_iterators = $iterators; }