Mainly add documentation for changes to fetcher and archive iterators, also makes it so in archive case fetcher talks less frequently with queue servers, a=chris
Mainly add documentation for changes to fetcher and archive iterators, also makes it so in archive case fetcher talks less frequently with queue servers, a=chris
diff --git a/bin/fetcher.php b/bin/fetcher.php
index c7918493e..8354039f1 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -1111,7 +1111,10 @@ class Fetcher implements CrawlConstants
}
/**
+ * Function to check if memory for this fetcher instance is getting low
+ * relative to what the system will allow.
*
+ * @return bool whether available memory is getting low
*/
function exceedMemoryThreshold()
{
@@ -1119,7 +1122,11 @@ class Fetcher implements CrawlConstants
}
/**
+ * At least once, and while memory is low, picks a server at random and
+ * sends any fetcher data we have to it.
*
+ * @param bool $at_least_once whether to send to at least one server or
+ * to only send if memory is above threshold
*/
function selectCurrentServerAndUpdateIfNeeded($at_least_once)
{
@@ -1589,7 +1596,15 @@ class Fetcher implements CrawlConstants
}
/**
+ * Page processors are allowed to extract up to MAX_LINKS_TO_EXTRACT links.
+ * This method attempts to cull from the doc_info struct the
+ * best MAX_LINKS_PER_PAGE. Currently, this is done by first removing
+ * links which are of a filetype or site the crawler is forbidden to crawl.
+ * Then a crude estimate of the information contained in the link's text:
+ * strlen(gzip(text)) is used to extract the best remaining links.
*
+ * @param array &$doc_info an array with a CrawlConstants::LINKS subarray.
+ * This subarray in turn contains url => text pairs.
*/
function pruneLinks(&$doc_info)
{
@@ -1803,8 +1818,9 @@ class Fetcher implements CrawlConstants
crawlLog($site_index.". $subdoc_info ".$site[self::URL]);
} // end for
- if((count($this->to_crawl) <= 0 && count($this->to_crawl_again) <= 0) ||
- ( isset($this->found_sites[self::SEEN_URLS]) &&
+ if(($this->crawl_type == self::WEB_CRAWL &&
+ count($this->to_crawl) <= 0 && count($this->to_crawl_again) <= 0) ||
+ (isset($this->found_sites[self::SEEN_URLS]) &&
count($this->found_sites[self::SEEN_URLS]) >
SEEN_URLS_BEFORE_UPDATE_SCHEDULER) ||
($this->archive_iterator &&
diff --git a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
index d3bd5b134..1028ed8fe 100644
--- a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
@@ -150,14 +150,18 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
var $buffer_filename;
/**
- * Name of
+ * Name of function to be called whenever the partition that the
+ * iterator is reading is changed. The point of the callback is to
+ * read meta information at the start of the new partition.
*
* @var string
*/
var $switch_partition_callback_name = NULL;
/**
- * Name of
+ * Contains basic parameters of how this iterator works: compression,
+ * start and stop delimiters. Typically, this data is read from the
+ * arc_description.ini file.
*
* @var array
*/
@@ -167,8 +171,15 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
* How many bytes at a time should be read from the current archive
* file into the buffer file. 8192 = BZip2BlockIteraror::BlOCK_SIZE
*/
- const BUFFER_SIZE = 16384000; //32768000;
+ const BUFFER_SIZE = 16384000;
+ /**
+ * Estimate of the maximum size of a record stored in a text archive.
+ * Data in archives is split into chunks of buffer size plus two record
+ * sizes. This is used to provide a two record overlap between successive
+ * chunks. This is further used to ensure that records that go over
+ * the basic chunk boundary of BUFFER_SIZE will be processed.
+ */
const MAX_RECORD_SIZE = 49152;
/**
@@ -229,7 +240,11 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
}
/**
+ * Mutator method for controlling how this text archive iterator behaves.
+ * Normally, data on compression and start and stop delimiters is read from
+ * an ini file. This method reads it from the supplied array.
*
+ * @param array $ini configuration settings for this archive iterator
*/
function setIniInfo($ini)
{
@@ -293,7 +308,13 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
}
/**
+ * Called to get the next chunk of BUFFER_SIZE + 2 * MAX_RECORD_SIZE bytes
+ * of data from the text archive. This data is returned unprocessed in
+ * self::ARC_DATA together with ini and header information about the
+ * archive. This method is typically called in the name server setting
+ * from FetchController.
*
+ * @return array with contents as described above
*/
function nextChunk()
{