Fixes more bugs in archive iterators, adding documentation, a=chris
Fixes more bugs in archive iterators, adding documentation, a=chris
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 8354039f1..04a63907e 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -573,7 +573,9 @@ class Fetcher implements CrawlConstants
crawlLog("New name: ".$this->web_archive->dir_name);
crawlLog("Switching archive...");
- continue;
+ if(!isset($info[self::ARC_DATA])) {
+ continue;
+ }
}
switch($this->crawl_type)
@@ -603,7 +605,9 @@ class Fetcher implements CrawlConstants
crawlLog("Number of summarized pages ".
count($summarized_site_pages));
- $this->updateFoundSites($summarized_site_pages);
+ $force_send = (isset($info[self::END_ITERATOR]) &&
+ $info[self::END_ITERATOR]) ? true : false;
+ $this->updateFoundSites($summarized_site_pages, $force_send);
$sleep_time = max(0, ceil(
MINIMUM_FETCH_LOOP_TIME - changeInMicrotime($start_time)));
@@ -1047,10 +1051,13 @@ class Fetcher implements CrawlConstants
crawlLog("Time to get archive data from local buffer ".
changeInMicrotime($start_time));
}
- if($archive_iterator->buffer_fh && $archive_iterator->current_offset
- < $max_offset) {
+ if($archive_iterator->buffer_fh
+ && $archive_iterator->current_offset < $max_offset ) {
return $info;
}
+ if(isset($info[self::ARC_DATA]) && count($info[self::ARC_DATA])>0){
+ $arc_data = $info[self::ARC_DATA];
+ }
crawlLog("Done processing Local Buffer, requesting more data...");
}
crawlLog("Fetching Archive data from name server with request:");
@@ -1099,6 +1106,9 @@ class Fetcher implements CrawlConstants
$archive_iterator->nextPages(1);
}
}
+ if(isset($arc_data)) {
+ $info[self::ARC_DATA] = $arc_data;
+ }
} else {
$info[self::ARC_DATA] = $pages;
}
@@ -1106,7 +1116,6 @@ class Fetcher implements CrawlConstants
crawlLog("Time to fetch archive data from name server ".
changeInMicrotime($start_time));
-
return $info;
}
@@ -1745,8 +1754,10 @@ class Fetcher implements CrawlConstants
* the queue server is called with the data.
*
* @param array $sites site data to use for the update
+ * @param bool $force_send whether to force send data back to queue_server
+ * or rely on usual thresholds before sending
*/
- function updateFoundSites($sites)
+ function updateFoundSites($sites, $force_send = false)
{
$start_time = microtime();
@@ -1818,7 +1829,7 @@ class Fetcher implements CrawlConstants
crawlLog($site_index.". $subdoc_info ".$site[self::URL]);
} // end for
- if(($this->crawl_type == self::WEB_CRAWL &&
+ if($force_send || ($this->crawl_type == self::WEB_CRAWL &&
count($this->to_crawl) <= 0 && count($this->to_crawl_again) <= 0) ||
(isset($this->found_sites[self::SEEN_URLS]) &&
count($this->found_sites[self::SEEN_URLS]) >
diff --git a/controllers/fetch_controller.php b/controllers/fetch_controller.php
index e84bf25da..2bc8a8003 100755
--- a/controllers/fetch_controller.php
+++ b/controllers/fetch_controller.php
@@ -249,7 +249,10 @@ class FetchController extends Controller implements CrawlConstants
}
@unlink($lock_filename);
}
- if (($chunk && $pages) || ($pages && !empty($pages))) {
+ if($archive_iterator->end_of_iterator) {
+ $info[self::END_ITERATOR] = true;
+ }
+ if (($chunk && $pages) || ($pages && !empty($pages))) {
$pages_string = webencode(gzcompress(serialize($pages)));
} else {
$info[self::STATUS] = self::NO_DATA_STATE;
diff --git a/lib/archive_bundle_iterators/database_bundle_iterator.php b/lib/archive_bundle_iterators/database_bundle_iterator.php
index 90c486ac5..d3fefebe7 100644
--- a/lib/archive_bundle_iterators/database_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/database_bundle_iterator.php
@@ -44,7 +44,7 @@ require_once BASE_DIR.'/lib/utility.php';
/**
* Used to iterate through the records that result from an SQL query to a
- * database
+ * database
*
* @author Chris Pollett
* @package seek_quarry
@@ -60,11 +60,32 @@ class DatabaseBundleIterator extends ArchiveBundleIterator
* @var string
*/
var $iterate_dir;
+
+ /**
+ * SQL query whose records we are indexing
+ * @var string
+ */
+ var $sql;
+
+ /**
+ * DB Records are imported as a text string where column_separator
+ * is used to delimit the end of a column
+ * @var string
+ */
+ var $column_separator;
+
+ /**
+ * For a given DB record each column is converted to a string:
+ * name_of_column field_value_separator value_of_column
+ * @var string
+ */
+ var $field_value_separator;
+
/**
- * The path to the directory where the iteration status is stored.
+ * What character encoding is used for the DB records
* @var string
*/
- var $result_dir;
+ var $encoding;
/**
* File handle for current arc file
diff --git a/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php b/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php
index d3c0d4d35..2cb7a4a60 100644
--- a/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php
@@ -71,13 +71,6 @@ class MixArchiveBundleIterator extends ArchiveBundleIterator
*/
var $mix_timestamp;
- /**
- * Used to hold timestamp of the index archive bundle of output results
- *
- * @var int
- */
- var $result_timestamp;
-
/**
* count of how far our into the crawl mix we've gone.
*
diff --git a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
index a7e128583..a7d749572 100644
--- a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
@@ -63,11 +63,6 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
* @var string
*/
var $iterate_dir;
- /**
- * The path to the directory where the iteration status is stored.
- * @var string
- */
- var $result_dir;
/**
* The number of arc files in this arc archive bundle
* @var int
diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 93849f92d..986853079 100644
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -218,6 +218,7 @@ interface CrawlConstants
const INI = 'cq';
const UI_FLAGS = 'cr';
const KEYWORD_LINKS = 'cs';
+ const END_ITERATOR = 'ct';
const NEEDS_OFFSET_FLAG = 0x7FFFFFFF;