Fix recrawls to work with fetcher prefixes

Shawn Tice [2012-03-12, 21h]
Fix recrawls to work with fetcher prefixes

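The fetcher now always has a fetcher number (default 0), and therefore always
a prefix of the form "<number>-" (default "0-"). The archive bundle iterators
(arc, MediaWiki, ODP RDF, and web archive) take that prefix as their first
constructor parameter. The arc_tool also expects a prefix when looking in the
cache directory for existing archives.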

Signed-off-by: Chris Pollett <chris@pollett.org>
Filename
bin/arc_tool.php
bin/fetcher.php
lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
lib/archive_bundle_iterators/web_archive_bundle_iterator.php
diff --git a/bin/arc_tool.php b/bin/arc_tool.php
index faa0d561f..fe673bcf6 100755
--- a/bin/arc_tool.php
+++ b/bin/arc_tool.php
@@ -169,7 +169,7 @@ class ArcTool implements CrawlConstants
      */
      function outputArchiveList()
      {
-        $pattern = CRAWL_DIR."/cache/{".self::archive_base_name.",".
+        $pattern = CRAWL_DIR."/cache/*-{".self::archive_base_name.",".
             self::index_data_base_name."}*";

         $archives = glob($pattern, GLOB_BRACE);
diff --git a/bin/fetcher.php b/bin/fetcher.php
index edaab2418..0bc182572 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -355,11 +355,10 @@ class Fetcher implements CrawlConstants
         CrawlDaemon::init($argv, "fetcher");
         if(isset($argv[2]) ) {
             $this->fetcher_num = intval($argv[2]);
-            crawlLog("\n\nInitialize logger..", $this->fetcher_num."-fetcher");
-
         } else {
-            crawlLog("\n\nInitialize logger..", "fetcher");
+            $this->fetcher_num = 0;
         }
+        crawlLog("\n\nInitialize logger..", $this->fetcher_num."-fetcher");

         $this->loop();
     }
@@ -377,10 +376,7 @@ class Fetcher implements CrawlConstants
     {
         crawlLog("In Fetch Loop");

-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";
         if(!file_exists(CRAWL_DIR."/{$prefix}temp")) {
             mkdir(CRAWL_DIR."/{$prefix}temp");
         }
@@ -512,10 +508,7 @@ class Fetcher implements CrawlConstants
             return array();
         }

-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";
         $tmp_dir = CRAWL_DIR."/{$prefix}temp";
         $site_pages = FetchUrl::getPages($sites, true,
             $this->page_range_request, $tmp_dir
@@ -552,11 +545,8 @@ class Fetcher implements CrawlConstants
      */
     function downloadPagesArchiveCrawl()
     {
-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
-        $base_name = CRAWL_DIR.'/cache/{$prefix}'.self::archive_base_name.
+        $prefix = $this->fetcher_num."-";
+        $base_name = CRAWL_DIR."/cache/{$prefix}".self::archive_base_name.
             $this->crawl_index;
         $pages = array();
         if(!isset($this->archive_iterator->iterate_timestamp) ||
@@ -575,7 +565,8 @@ class Fetcher implements CrawlConstants
                 }
                 $iterator_name = $arctype."Iterator";
                 $this->archive_iterator =
-                    new $iterator_name($this->crawl_index, $this->crawl_time);
+                    new $iterator_name($prefix, $this->crawl_index,
+                        $this->crawl_time);
                 if($this->archive_iterator == NULL) {
                     crawlLog("Error creating archive iterator!!");
                     return $pages;
@@ -597,10 +588,7 @@ class Fetcher implements CrawlConstants
      */
     function deleteOldCrawls(&$still_active_crawls)
     {
-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";
         $dirs = glob(CRAWL_DIR.'/cache/*', GLOB_ONLYDIR);

         $full_base_name = $prefix . self::archive_base_name;
@@ -653,10 +641,7 @@ class Fetcher implements CrawlConstants
         $time = time();
         $session = md5($time . AUTH_KEY);

-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";

         /* if just restarted, check to make sure the crawl hasn't changed,
            if it has bail
@@ -742,10 +727,7 @@ class Fetcher implements CrawlConstants
      */
     function checkScheduler()
     {
-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";

         $info = array();
         if((count($this->to_crawl) > 0 || count($this->to_crawl_again) > 0) &&
@@ -987,10 +969,7 @@ class Fetcher implements CrawlConstants
         crawlLog("  Start process pages...");
         $start_time = microtime();

-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";

         $stored_site_pages = array();
         $summarized_site_pages = array();
@@ -1573,10 +1552,7 @@ class Fetcher implements CrawlConstants
     {
         $queue_server = $this->queue_servers[$this->current_server];

-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";

         if(count($this->to_crawl) <= 0) {
             $schedule_time = $this->schedule_time;
diff --git a/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php b/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
index 7120991d4..9739fb9ff 100644
--- a/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
@@ -80,6 +80,11 @@ class ArcArchiveBundleIterator implements CrawlConstants
      *  @var resource
      */
     var $fh;
+    /**
+     * The fetcher prefix associated with this archive.
+     * @var string
+     */
+    var $fetcher_prefix;

     /**
      * Creates a arc archive iterator with the given parameters.
@@ -89,12 +94,12 @@ class ArcArchiveBundleIterator implements CrawlConstants
      * @param string $result_timestamp timestamp of the arc archive bundle
      *      results are being stored in
      */
-    function __construct($iterate_timestamp, $result_timestamp)
+    function __construct($prefix, $iterate_timestamp, $result_timestamp)
     {
+        $this->fetcher_prefix = $prefix;
         $this->iterate_timestamp = $iterate_timestamp;
         $this->result_timestamp = $result_timestamp;
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $iterate_timestamp;
+        $archive_name = $this->get_archive_name($iterate_timestamp);
         $this->partitions = array();
         foreach(glob("$archive_name/*.arc.gz") as $filename) {
             $this->partitions[] = $filename;
@@ -113,9 +118,6 @@ class ArcArchiveBundleIterator implements CrawlConstants
         } else {
             $this->reset();
         }
-
-
-
     }

     /**
@@ -139,8 +141,7 @@ class ArcArchiveBundleIterator implements CrawlConstants
         $this->end_of_iterator = false;
         $this->current_offset = 0;
         $this->fh = NULL;
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         @unlink("$archive_name/iterate_status.txt");
     }

@@ -176,8 +177,7 @@ class ArcArchiveBundleIterator implements CrawlConstants
             $this->current_offset = gztell($this->fh);
         }

-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         $info = array();
         $info['end_of_iterator'] = $this->end_of_iterator;
         $info['current_partition_num'] = $this->current_partition_num;
@@ -217,5 +217,18 @@ class ArcArchiveBundleIterator implements CrawlConstants
         return $site;
     }

+    /**
+     * Returns the path to an archive given its timestamp.
+     *
+     * @param string $timestamp the archive timestamp
+     * @return string the path to the archive, based off of the fetcher prefix
+     *     used when this iterator was constructed
+     */
+    function get_archive_name($timestamp)
+    {
+        return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
+            self::archive_base_name.$timestamp;
+    }
+
 }
 ?>
diff --git a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
index 7e965924d..d1ff02d59 100644
--- a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
@@ -98,6 +98,11 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
      *  @var resource
      */
     var $fh;
+    /**
+     * The fetcher prefix associated with this archive.
+     * @var string
+     */
+    var $fetcher_prefix;

     /**
      * Start state of FSA for lexing media wiki docs
@@ -140,12 +145,12 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
      * @param string $result_timestamp timestamp of the arc archive bundle
      *      results are being stored in
      */
-    function __construct($iterate_timestamp, $result_timestamp)
+    function __construct($prefix, $iterate_timestamp, $result_timestamp)
     {
+        $this->fetcher_prefix = $prefix;
         $this->iterate_timestamp = $iterate_timestamp;
         $this->result_timestamp = $result_timestamp;
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $iterate_timestamp;
+        $archive_name = $this->get_archive_name($iterate_timestamp);
         $this->partitions = array();
         foreach(glob("$archive_name/*.xml.bz2") as $filename) {
             $this->partitions[] = $filename;
@@ -262,8 +267,7 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
         $this->current_offset = 0;
         $this->fh = NULL;
         $this->buffer = "";
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         @unlink("$archive_name/iterate_status.txt");
     }

@@ -320,8 +324,7 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
             $this->current_page_num += $page_count;
         }

-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         $info = array();
         $info['end_of_iterator'] = $this->end_of_iterator;
         $info['current_partition_num'] = $this->current_partition_num;
@@ -598,5 +601,19 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
         } while($continue && $pos < $len);
         return array($token, $state, $pos);
     }
+
+    /**
+     * Returns the path to an archive given its timestamp.
+     *
+     * @param string $timestamp the archive timestamp
+     * @return string the path to the archive, based off of the fetcher prefix
+     *     used when this iterator was constructed
+     */
+    function get_archive_name($timestamp)
+    {
+        return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
+            self::archive_base_name.$timestamp;
+    }
+
 }
 ?>
diff --git a/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php b/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
index f822df2fe..3054b3da9 100644
--- a/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
@@ -91,10 +91,17 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
      *  @var resource
      */
     var $fh;
+    /**
+     * The fetcher prefix associated with this archive.
+     * @var string
+     */
+    var $fetcher_prefix;
+
     /**
      * How many bytes to read into buffer from bz2 stream in one go
      */
     const BLOCK_SIZE = 8192;
+
     /**
      * Creates an open directory rdf archive iterator with the given parameters.
      *
@@ -103,12 +110,12 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
      * @param string $result_timestamp timestamp of the arc archive bundle
      *      results are being stored in
      */
-    function __construct($iterate_timestamp, $result_timestamp)
+    function __construct($prefix, $iterate_timestamp, $result_timestamp)
     {
+        $this->fetcher_prefix = $prefix;
         $this->iterate_timestamp = $iterate_timestamp;
         $this->result_timestamp = $result_timestamp;
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $iterate_timestamp;
+        $archive_name = $this->get_archive_name($iterate_timestamp);
         $this->partitions = array();
         foreach(glob("$archive_name/*.gz") as $filename) {
             $this->partitions[] = $filename;
@@ -263,8 +270,7 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
         $this->current_offset = 0;
         $this->fh = NULL;
         $this->buffer = "";
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         @unlink("$archive_name/iterate_status.txt");
     }

@@ -316,8 +322,7 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
             $this->current_page_num += $page_count;
         }

-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         $info = array();
         $info['end_of_iterator'] = $this->end_of_iterator;
         $info['current_partition_num'] = $this->current_partition_num;
@@ -470,5 +475,18 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
         }
         return $html;
     }
+
+    /**
+     * Returns the path to an archive given its timestamp.
+     *
+     * @param string $timestamp the archive timestamp
+     * @return string the path to the archive, based off of the fetcher prefix
+     *     used when this iterator was constructed
+     */
+    function get_archive_name($timestamp)
+    {
+        return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
+            self::archive_base_name.$timestamp;
+    }
 }
 ?>
diff --git a/lib/archive_bundle_iterators/web_archive_bundle_iterator.php b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
index c8ea2505d..ec3ab280d 100644
--- a/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
@@ -88,24 +88,29 @@ class WebArchiveBundleIterator implements CrawlConstants
      * @var object
      */
     var $archive;
+    /**
+     * The fetcher prefix associated with this archive.
+     * @var string
+     */
+    var $fetcher_prefix;

     /**
      * Creates a web archive iterator with the given parameters.
      *
+     * @param string $prefix this bundle's fetcher prefix (fetcher number + "-")
      * @param string $iterate_timestamp timestamp of the web archive bundle to
      *      iterate  over the pages of
      * @param string $result_timestamp timestamp of the web archive bundle
      *      results are being stored in
      */
-    function __construct($iterate_timestamp, $result_timestamp)
+    function __construct($prefix, $iterate_timestamp, $result_timestamp)
     {
+        $this->fetcher_prefix = $prefix;
         $this->iterate_timestamp = $iterate_timestamp;
         $this->result_timestamp = $result_timestamp;
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $iterate_timestamp;
+        $archive_name = $this->get_archive_name($iterate_timestamp);
         $this->archive = new WebArchiveBundle($archive_name);
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $result_timestamp;
+        $archive_name = $this->get_archive_name($result_timestamp);
         if(file_exists("$archive_name/iterate_status.txt")) {
             $info = unserialize(file_get_contents(
                 "$archive_name/iterate_status.txt"));
@@ -170,8 +175,7 @@ class WebArchiveBundleIterator implements CrawlConstants
         $this->end_of_iterator = ($this->overall_index >= $this->count ) ?
             true : false;

-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         $info = array();
         $info['overall_index'] = $this->overall_index;
         $info['end_of_iterator'] = $this->end_of_iterator;
@@ -199,10 +203,22 @@ class WebArchiveBundleIterator implements CrawlConstants
         $this->partition = $this->archive->getPartition(
             $this->current_partition_num, false);
         $this->partition->reset();
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         @unlink("$archive_name/iterate_status.txt");
     }

+    /**
+     * Returns the path to an archive given its timestamp.
+     *
+     * @param string $timestamp the archive timestamp
+     * @return string the path to the archive, based off of the fetcher prefix
+     *     used when this iterator was constructed
+     */
+    function get_archive_name($timestamp)
+    {
+        return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
+            self::archive_base_name.$timestamp;
+    }
+
 }
 ?>
ViewGit