Fixes more bugs in archive iterators, adding documentation, a=chris

Chris Pollett [2013-04-04 19:Apr:th]
Fixes more bugs in archive iterators, adding documentation, a=chris
Filename
bin/fetcher.php
controllers/fetch_controller.php
lib/archive_bundle_iterators/database_bundle_iterator.php
lib/archive_bundle_iterators/mix_archive_bundle_iterator.php
lib/archive_bundle_iterators/text_archive_bundle_iterator.php
lib/crawl_constants.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 8354039f1..04a63907e 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -573,7 +573,9 @@ class Fetcher implements CrawlConstants

                 crawlLog("New name: ".$this->web_archive->dir_name);
                 crawlLog("Switching archive...");
-                continue;
+                if(!isset($info[self::ARC_DATA])) {
+                    continue;
+                }
             }

             switch($this->crawl_type)
@@ -603,7 +605,9 @@ class Fetcher implements CrawlConstants
             crawlLog("Number of summarized pages ".
                 count($summarized_site_pages));

-            $this->updateFoundSites($summarized_site_pages);
+            $force_send = (isset($info[self::END_ITERATOR]) &&
+                $info[self::END_ITERATOR]) ? true : false;
+            $this->updateFoundSites($summarized_site_pages, $force_send);

             $sleep_time = max(0, ceil(
                 MINIMUM_FETCH_LOOP_TIME - changeInMicrotime($start_time)));
@@ -1047,10 +1051,13 @@ class Fetcher implements CrawlConstants
                 crawlLog("Time to get archive data from local buffer ".
                     changeInMicrotime($start_time));
             }
-            if($archive_iterator->buffer_fh && $archive_iterator->current_offset
-                < $max_offset) {
+            if($archive_iterator->buffer_fh
+                && $archive_iterator->current_offset < $max_offset ) {
                 return $info;
             }
+            if(isset($info[self::ARC_DATA]) && count($info[self::ARC_DATA])>0){
+                $arc_data = $info[self::ARC_DATA];
+            }
             crawlLog("Done processing Local Buffer, requesting more data...");
         }
         crawlLog("Fetching Archive data from name server with request:");
@@ -1099,6 +1106,9 @@ class Fetcher implements CrawlConstants
                         $archive_iterator->nextPages(1);
                     }
                 }
+                if(isset($arc_data)) {
+                    $info[self::ARC_DATA] = $arc_data;
+                }
             } else {
                 $info[self::ARC_DATA] = $pages;
             }
@@ -1106,7 +1116,6 @@ class Fetcher implements CrawlConstants

         crawlLog("Time to fetch archive data from name server ".
             changeInMicrotime($start_time));
-
         return $info;
     }

@@ -1745,8 +1754,10 @@ class Fetcher implements CrawlConstants
      * the queue server is called with the data.
      *
      * @param array $sites site data to use for the update
+     * @param bool $force_send whether to force send data back to queue_server
+     *      or rely on usual thresholds before sending
      */
-    function updateFoundSites($sites)
+    function updateFoundSites($sites, $force_send = false)
     {
         $start_time = microtime();

@@ -1818,7 +1829,7 @@ class Fetcher implements CrawlConstants
             crawlLog($site_index.". $subdoc_info ".$site[self::URL]);

         } // end for
-        if(($this->crawl_type == self::WEB_CRAWL &&
+        if($force_send || ($this->crawl_type == self::WEB_CRAWL &&
             count($this->to_crawl) <= 0 && count($this->to_crawl_again) <= 0) ||
                 (isset($this->found_sites[self::SEEN_URLS]) &&
                 count($this->found_sites[self::SEEN_URLS]) >
diff --git a/controllers/fetch_controller.php b/controllers/fetch_controller.php
index e84bf25da..2bc8a8003 100755
--- a/controllers/fetch_controller.php
+++ b/controllers/fetch_controller.php
@@ -249,7 +249,10 @@ class FetchController extends Controller implements CrawlConstants
             }
             @unlink($lock_filename);
         }
-         if (($chunk && $pages) || ($pages && !empty($pages))) {
+        if($archive_iterator->end_of_iterator) {
+            $info[self::END_ITERATOR] = true;
+        }
+        if (($chunk && $pages) || ($pages && !empty($pages))) {
             $pages_string = webencode(gzcompress(serialize($pages)));
         } else {
             $info[self::STATUS] = self::NO_DATA_STATE;
diff --git a/lib/archive_bundle_iterators/database_bundle_iterator.php b/lib/archive_bundle_iterators/database_bundle_iterator.php
index 90c486ac5..d3fefebe7 100644
--- a/lib/archive_bundle_iterators/database_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/database_bundle_iterator.php
@@ -44,7 +44,7 @@ require_once BASE_DIR.'/lib/utility.php';

 /**
  * Used to iterate through the records that result from an SQL query to a
- *  database
+ * database
  *
  * @author Chris Pollett
  * @package seek_quarry
@@ -60,11 +60,32 @@ class DatabaseBundleIterator extends ArchiveBundleIterator
      * @var string
      */
     var $iterate_dir;
+
+    /**
+     * SQL query whose records we are indexing
+     * @var string
+     */
+    var $sql;
+
+    /**
+     * DB Records are imported as a text string where column_separator
+     * is used to delimit the end of a column
+     * @var string
+     */
+    var $column_separator;
+
+    /**
+     * For a given DB record each column is converted to a string:
+     * name_of_column field_value_separator value_of_column
+     * @var string
+     */
+    var $field_value_separator;
+
     /**
-     * The path to the directory where the iteration status is stored.
+     * What character encoding is used for the DB records
      * @var string
      */
-    var $result_dir;
+    var $encoding;

     /**
      *  File handle for current arc file
diff --git a/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php b/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php
index d3c0d4d35..2cb7a4a60 100644
--- a/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php
@@ -71,13 +71,6 @@ class MixArchiveBundleIterator extends ArchiveBundleIterator
      */
     var $mix_timestamp;

-    /**
-     * Used to hold timestamp of the index archive bundle of output results
-     *
-     * @var int
-     */
-    var $result_timestamp;
-
     /**
      * count of how far our into the crawl mix we've gone.
      *
diff --git a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
index a7e128583..a7d749572 100644
--- a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
@@ -63,11 +63,6 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
      * @var string
      */
     var $iterate_dir;
-    /**
-     * The path to the directory where the iteration status is stored.
-     * @var string
-     */
-    var $result_dir;
     /**
      * The number of arc files in this arc archive bundle
      *  @var int
diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 93849f92d..986853079 100644
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -218,6 +218,7 @@ interface CrawlConstants
     const INI = 'cq';
     const UI_FLAGS = 'cr';
     const KEYWORD_LINKS = 'cs';
+    const END_ITERATOR = 'ct';

     const NEEDS_OFFSET_FLAG = 0x7FFFFFFF;
ViewGit