Chris Pollett [2013-03-25]
Mainly add documentation for changes to fetcher and archive iterators, also makes it so in archive case fetcher talks less frequently with queue servers, a=chris
Filename
bin/fetcher.php
lib/archive_bundle_iterators/text_archive_bundle_iterator.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index c7918493e..8354039f1 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -1111,7 +1111,10 @@ class Fetcher implements CrawlConstants
     }

     /**
+     * Function to check if memory for this fetcher instance is getting low
+     * relative to what the system will allow.
      *
+     * @return bool whether available memory is getting low
      */
     function exceedMemoryThreshold()
     {
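A minimal sketch of how a check like this could be written in PHP, assuming
the limit comes from php.ini's memory_limit and that some fixed fraction of
it counts as "getting low"; the 0.7 fraction and the helper name are
illustrative, not Yioop's actual values:

    // Illustrative only: compare current usage to a fraction of memory_limit
    function exceedMemoryThresholdSketch()
    {
        $limit = ini_get('memory_limit'); // e.g. "128M"; -1 (unlimited) ignored here
        $units = array('K' => 1024, 'M' => 1048576, 'G' => 1073741824);
        $suffix = strtoupper(substr($limit, -1));
        $bytes = isset($units[$suffix]) ?
            intval($limit) * $units[$suffix] : intval($limit);
        // treat 70% of the allowed memory as the low-memory threshold (assumed)
        return memory_get_usage() > 0.7 * $bytes;
    }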
@@ -1119,7 +1122,11 @@ class Fetcher implements CrawlConstants
     }

     /**
+     * At least once, and while memory is low, picks a queue server at random
+     * and sends any fetcher data we have to it.
      *
+     * @param bool $at_least_once whether to send to at least one queue server
+     *      or to only send if memory is above threshold
      */
     function selectCurrentServerAndUpdateIfNeeded($at_least_once)
     {
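The control flow this documents might look roughly like the following sketch;
apart from exceedMemoryThreshold, the field and method names here are
placeholders rather than the fetcher's actual code:

    function selectCurrentServerAndUpdateIfNeededSketch($at_least_once)
    {
        while ($at_least_once || $this->exceedMemoryThreshold()) {
            // pick one of the configured queue servers at random
            $this->current_server =
                mt_rand(0, count($this->queue_servers) - 1);
            // send whatever found-site data this fetcher has accumulated
            $this->updateScheduler(); // placeholder for the actual send step
            $at_least_once = false;
        }
    }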
@@ -1589,7 +1596,15 @@ class Fetcher implements CrawlConstants
     }

     /**
+     * Page processors are allowed to extract up to MAX_LINKS_TO_EXTRACT
+     * links per page. This method attempts to cull from the doc_info struct
+     * the best MAX_LINKS_PER_PAGE of these. Currently, this is done by first
+     * removing links whose filetype or site the crawler is forbidden from
+     * crawling. Then a crude estimate of the information contained in the
+     * link text, strlen(gzip(text)), is used to select the best remaining links.
      *
+     * @param array &$doc_info an array with a CrawlConstants::LINKS subarray.
+     *  This subarray in turn contains url => text pairs.
      */
     function pruneLinks(&$doc_info)
     {
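A simplified sketch of the pruning idea described above, written as if it
were a Fetcher method and scoring each link by the compressed length of its
anchor text; the limit parameter and the allowedToCrawlSite() policy check
are stand-ins for the real constants and tests:

    function pruneLinksSketch(&$doc_info, $max_links_per_page = 50)
    {
        $scores = array();
        foreach ($doc_info[self::LINKS] as $url => $text) {
            // drop links the crawler is forbidden from following
            if (!allowedToCrawlSite($url)) { continue; }
            // crude information estimate: length of the gzip'd link text
            $scores[$url] = strlen(gzcompress($text));
        }
        arsort($scores); // most "informative" links first
        $keep = array_slice(array_keys($scores), 0, $max_links_per_page);
        $doc_info[self::LINKS] = array_intersect_key(
            $doc_info[self::LINKS], array_flip($keep));
    }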
@@ -1803,8 +1818,9 @@ class Fetcher implements CrawlConstants
             crawlLog($site_index.". $subdoc_info ".$site[self::URL]);

         } // end for
-        if((count($this->to_crawl) <= 0 && count($this->to_crawl_again) <= 0) ||
-                ( isset($this->found_sites[self::SEEN_URLS]) &&
+        if(($this->crawl_type == self::WEB_CRAWL &&
+            count($this->to_crawl) <= 0 && count($this->to_crawl_again) <= 0) ||
+                (isset($this->found_sites[self::SEEN_URLS]) &&
                 count($this->found_sites[self::SEEN_URLS]) >
                 SEEN_URLS_BEFORE_UPDATE_SCHEDULER) ||
                 ($this->archive_iterator &&
diff --git a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
index d3bd5b134..1028ed8fe 100644
--- a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
@@ -150,14 +150,18 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
     var $buffer_filename;

     /**
-     * Name of
+     * Name of the function to call whenever the partition that the iterator
+     * is reading from changes. The point of the callback is to read meta
+     * information at the start of the new partition
      *
      * @var string
      */
     var $switch_partition_callback_name = NULL;

     /**
-     * Name of
+     * Contains basic parameters of how this iterator works: compression,
+     * start and stop delimiters. Typically, this data is read from the
+     * arc_description.ini file
      *
      * @var array
      */
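Such a callback might be invoked along the following lines when the iterator
moves on to a new partition file; only the $switch_partition_callback_name
field comes from the class itself, the rest is illustrative:

    // inside the iterator, after opening the next partition file:
    if ($this->switch_partition_callback_name != NULL) {
        // e.g. an archive-format-specific routine that parses the meta
        // information at the head of the newly opened partition
        $success = call_user_func(
            array($this, $this->switch_partition_callback_name));
    }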
@@ -167,8 +171,15 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
      * How many bytes at a time should be read from the current archive
     * file into the buffer file. 8192 = BZip2BlockIterator::BLOCK_SIZE
      */
-    const BUFFER_SIZE = 16384000; //32768000;
+    const BUFFER_SIZE = 16384000;

+    /**
+     *  Estimate of the maximum size of a record stored in a text archive.
+     *  Data in archives is split into chunks of buffer size plus two record
+     *  sizes. This provides a two record overlap between successive chunks,
+     *  which in turn ensures that records that cross the basic chunk
+     *  boundary of BUFFER_SIZE will still be processed.
+     */
     const MAX_RECORD_SIZE = 49152;
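The overlap scheme these two constants describe can be pictured as follows:
each read pulls in BUFFER_SIZE + 2 * MAX_RECORD_SIZE bytes but the read
position only advances by BUFFER_SIZE, so a record straddling a chunk
boundary appears whole in the next chunk. A rough sketch, imagined as code
inside the iterator rather than its actual read loop:

    // hedged sketch of reading overlapping chunks from one partition file
    $chunk_size = self::BUFFER_SIZE + 2 * self::MAX_RECORD_SIZE;
    $offset = 0;
    while (($data = file_get_contents($partition_file, false, NULL,
            $offset, $chunk_size)) !== false && strlen($data) > 0) {
        processRecordsIn($data); // placeholder for record extraction
        $offset += self::BUFFER_SIZE; // advance less than we read => overlap
    }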

     /**
@@ -229,7 +240,11 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
     }

     /**
+     *  Mutator method for controlling how this text archive iterator behaves.
+     *  Normally, data on compression and start and stop delimiters is read
+     *  from an ini file; this method reads it from the supplied array instead.
      *
+     *  @param array $ini configuration settings for this archive iterator
      */
     function setIniInfo($ini)
     {
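A hedged example of calling it directly; the key names below are illustrative
of the kind of settings an arc_description.ini provides rather than the exact
keys the iterator expects:

    $iterator->setIniInfo(array(
        'compression' => 'gzip',         // how partition files are compressed
        'start_delimiter' => '/<doc>/',  // marks the start of a record
        'end_delimiter' => '/<\/doc>/',  // marks the end of a record
    ));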
@@ -293,7 +308,13 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
     }

     /**
+     *  Called to get the next chunk of BUFFER_SIZE + 2 * MAX_RECORD_SIZE bytes
+     *  of data from the text archive. This data is returned unprocessed in
+     *  self::ARC_DATA together with ini and header information about the
+     *  archive. This method is typically called in the name server setting
+     *  from FetchController.
      *
+     *  @return array with contents as described above
      */
     function nextChunk()
     {
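On the name server side, usage might look roughly like this sketch; only
nextChunk() and CrawlConstants::ARC_DATA come from the code above, the rest
is illustrative:

    // e.g. inside a FetchController-style handler on the name server
    $chunk = $archive_iterator->nextChunk();
    if (isset($chunk[CrawlConstants::ARC_DATA]) &&
        $chunk[CrawlConstants::ARC_DATA] != "") {
        // hand the raw archive data plus its ini/header info back to the
        // requesting fetcher, which does the actual page processing
        sendToFetcher($chunk); // placeholder for the actual response step
    }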