refactoring archive iterators and making them extend archive_bundle_iterator, a=chris

Chris Pollett [2012-03-16, hour 22 (minutes and seconds not recorded)]
refactoring archive iterators and making them extend archive_bundle_iterator, a=chris
Filename
lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
lib/archive_bundle_iterators/archive_bundle_iterator.php
lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
lib/archive_bundle_iterators/web_archive_bundle_iterator.php
diff --git a/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php b/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
index 9739fb9ff..cd704ed70 100644
--- a/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
@@ -50,7 +50,8 @@ require_once BASE_DIR.
  * @subpackage iterator
  * @see WebArchiveBundle
  */
-class ArcArchiveBundleIterator implements CrawlConstants
+class ArcArchiveBundleIterator extends ArchiveBundleIterator
+    implements CrawlConstants
 {
     /**
      * The number of arc files in this arc archive bundle
@@ -80,11 +81,7 @@ class ArcArchiveBundleIterator implements CrawlConstants
      *  @var resource
      */
     var $fh;
-    /**
-     * The fetcher prefix associated with this archive.
-     * @var string
-     */
-    var $fetcher_prefix;
+

     /**
      * Creates an arc archive iterator with the given parameters.
@@ -217,18 +214,5 @@ class ArcArchiveBundleIterator implements CrawlConstants
         return $site;
     }

-    /**
-     * Returns the path to an archive given its timestamp.
-     *
-     * @param string $timestamp the archive timestamp
-     * @return string the path to the archive, based off of the fetcher prefix
-     *     used when this iterator was constructed
-     */
-    function get_archive_name($timestamp)
-    {
-        return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
-            self::archive_base_name.$timestamp;
-    }
-
 }
 ?>
diff --git a/lib/archive_bundle_iterators/archive_bundle_iterator.php b/lib/archive_bundle_iterators/archive_bundle_iterator.php
index d1a5bf740..7a965f666 100644
--- a/lib/archive_bundle_iterators/archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/archive_bundle_iterator.php
@@ -48,8 +48,6 @@ require_once BASE_DIR."/lib/crawl_constants.php";
  */
 abstract class ArchiveBundleIterator implements CrawlConstants
 {
-
-
     /**
      * Timestamp of the archive that is being iterated over
      * @var int
@@ -68,6 +66,25 @@ abstract class ArchiveBundleIterator implements CrawlConstants
      */
      var $end_of_iterator;

+    /**
+     * The fetcher prefix associated with this archive.
+     * @var string
+     */
+    var $fetcher_prefix;
+
+    /**
+     * Returns the path to an archive given its timestamp.
+     *
+     * @param string $timestamp the archive timestamp
+     * @return string the path to the archive, based off of the fetcher prefix
+     *     used when this iterator was constructed
+     */
+    function get_archive_name($timestamp)
+    {
+        return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
+            self::archive_base_name.$timestamp;
+    }
+
     /**
     * Estimates the importance of the site according to the weighting of
      * the particular archive iterator
diff --git a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
index d1ff02d59..fbbd165da 100644
--- a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
@@ -50,7 +50,8 @@ require_once BASE_DIR.
  * @subpackage iterator
  * @see WebArchiveBundle
  */
-class MediaWikiArchiveBundleIterator implements CrawlConstants
+class MediaWikiArchiveBundleIterator extends ArchiveBundleIterator
+    implements CrawlConstants
 {
     /**
      * The number of arc files in this arc archive bundle
@@ -98,11 +99,6 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
      *  @var resource
      */
     var $fh;
-    /**
-     * The fetcher prefix associated with this archive.
-     * @var string
-     */
-    var $fetcher_prefix;

     /**
      * Start state of FSA for lexing media wiki docs
@@ -602,18 +598,5 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
         return array($token, $state, $pos);
     }

-    /**
-     * Returns the path to an archive given its timestamp.
-     *
-     * @param string $timestamp the archive timestamp
-     * @return string the path to the archive, based off of the fetcher prefix
-     *     used when this iterator was constructed
-     */
-    function get_archive_name($timestamp)
-    {
-        return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
-            self::archive_base_name.$timestamp;
-    }
-
 }
 ?>
diff --git a/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php b/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
index 3054b3da9..75d74e290 100644
--- a/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
@@ -50,7 +50,8 @@ require_once BASE_DIR.
  * @subpackage iterator
  * @see WebArchiveBundle
  */
-class OdpRdfArchiveBundleIterator implements CrawlConstants
+class OdpRdfArchiveBundleIterator extends ArchiveBundleIterator
+    implements CrawlConstants
 {
     /**
      * The number of arc files in this arc archive bundle
@@ -91,11 +92,6 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
      *  @var resource
      */
     var $fh;
-    /**
-     * The fetcher prefix associated with this archive.
-     * @var string
-     */
-    var $fetcher_prefix;

     /**
      * How many bytes to read into buffer from bz2 stream in one go
@@ -476,17 +472,5 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
         return $html;
     }

-    /**
-     * Returns the path to an archive given its timestamp.
-     *
-     * @param string $timestamp the archive timestamp
-     * @return string the path to the archive, based off of the fetcher prefix
-     *     used when this iterator was constructed
-     */
-    function get_archive_name($timestamp)
-    {
-        return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
-            self::archive_base_name.$timestamp;
-    }
 }
 ?>
diff --git a/lib/archive_bundle_iterators/web_archive_bundle_iterator.php b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
index ec3ab280d..6479b99de 100644
--- a/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
@@ -49,7 +49,8 @@ require_once BASE_DIR.
  * @subpackage iterator
  * @see WebArchiveBundle
  */
-class WebArchiveBundleIterator implements CrawlConstants
+class WebArchiveBundleIterator extends ArchiveBundleIterator
+    implements CrawlConstants
 {

     /**
@@ -88,11 +89,6 @@ class WebArchiveBundleIterator implements CrawlConstants
      * @var object
      */
     var $archive;
-    /**
-     * The fetcher prefix associated with this archive.
-     * @var string
-     */
-    var $fetcher_prefix;

     /**
      * Creates a web archive iterator with the given parameters.
@@ -206,19 +202,5 @@ class WebArchiveBundleIterator implements CrawlConstants
         $archive_name = $this->get_archive_name($this->result_timestamp);
         @unlink("$archive_name/iterate_status.txt");
     }
-
-    /**
-     * Returns the path to an archive given its timestamp.
-     *
-     * @param string $timestamp the archive timestamp
-     * @return string the path to the archive, based off of the fetcher prefix
-     *     used when this iterator was constructed
-     */
-    function get_archive_name($timestamp)
-    {
-        return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
-            self::archive_base_name.$timestamp;
-    }
-
 }
 ?>
ViewGit