refactoring archive iterators to make them extend ArchiveBundleIterator, a=chris
refactoring archive iterators to make them extend ArchiveBundleIterator, a=chris
diff --git a/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php b/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
index 9739fb9ff..cd704ed70 100644
--- a/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
@@ -50,7 +50,8 @@ require_once BASE_DIR.
* @subpackage iterator
* @see WebArchiveBundle
*/
-class ArcArchiveBundleIterator implements CrawlConstants
+class ArcArchiveBundleIterator extends ArchiveBundleIterator
+ implements CrawlConstants
{
/**
* The number of arc files in this arc archive bundle
@@ -80,11 +81,7 @@ class ArcArchiveBundleIterator implements CrawlConstants
* @var resource
*/
var $fh;
- /**
- * The fetcher prefix associated with this archive.
- * @var string
- */
- var $fetcher_prefix;
+
/**
* Creates a arc archive iterator with the given parameters.
@@ -217,18 +214,5 @@ class ArcArchiveBundleIterator implements CrawlConstants
return $site;
}
- /**
- * Returns the path to an archive given its timestamp.
- *
- * @param string $timestamp the archive timestamp
- * @return string the path to the archive, based off of the fetcher prefix
- * used when this iterator was constructed
- */
- function get_archive_name($timestamp)
- {
- return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
- self::archive_base_name.$timestamp;
- }
-
}
?>
diff --git a/lib/archive_bundle_iterators/archive_bundle_iterator.php b/lib/archive_bundle_iterators/archive_bundle_iterator.php
index d1a5bf740..7a965f666 100644
--- a/lib/archive_bundle_iterators/archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/archive_bundle_iterator.php
@@ -48,8 +48,6 @@ require_once BASE_DIR."/lib/crawl_constants.php";
*/
abstract class ArchiveBundleIterator implements CrawlConstants
{
-
-
/**
* Timestamp of the archive that is being iterated over
* @var int
@@ -68,6 +66,25 @@ abstract class ArchiveBundleIterator implements CrawlConstants
*/
var $end_of_iterator;
+ /**
+ * The fetcher prefix associated with this archive.
+ * @var string
+ */
+ var $fetcher_prefix;
+
+ /**
+ * Builds the filesystem path of the archive that has the given timestamp.
+ *
+ * @param string $timestamp the archive timestamp
+ * @return string CRAWL_DIR.'/cache/' followed by this iterator's
+ * $fetcher_prefix, the archive_base_name constant, and $timestamp
+ */
+ function get_archive_name($timestamp)
+ {
+ return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
+ self::archive_base_name.$timestamp;
+ }
+
/**
* Estimates the important of the site according to the weighting of
* the particular archive iterator
diff --git a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
index d1ff02d59..fbbd165da 100644
--- a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
@@ -50,7 +50,8 @@ require_once BASE_DIR.
* @subpackage iterator
* @see WebArchiveBundle
*/
-class MediaWikiArchiveBundleIterator implements CrawlConstants
+class MediaWikiArchiveBundleIterator extends ArchiveBundleIterator
+ implements CrawlConstants
{
/**
* The number of arc files in this arc archive bundle
@@ -98,11 +99,6 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
* @var resource
*/
var $fh;
- /**
- * The fetcher prefix associated with this archive.
- * @var string
- */
- var $fetcher_prefix;
/**
* Start state of FSA for lexing media wiki docs
@@ -602,18 +598,5 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
return array($token, $state, $pos);
}
- /**
- * Returns the path to an archive given its timestamp.
- *
- * @param string $timestamp the archive timestamp
- * @return string the path to the archive, based off of the fetcher prefix
- * used when this iterator was constructed
- */
- function get_archive_name($timestamp)
- {
- return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
- self::archive_base_name.$timestamp;
- }
-
}
?>
diff --git a/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php b/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
index 3054b3da9..75d74e290 100644
--- a/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
@@ -50,7 +50,8 @@ require_once BASE_DIR.
* @subpackage iterator
* @see WebArchiveBundle
*/
-class OdpRdfArchiveBundleIterator implements CrawlConstants
+class OdpRdfArchiveBundleIterator extends ArchiveBundleIterator
+ implements CrawlConstants
{
/**
* The number of arc files in this arc archive bundle
@@ -91,11 +92,6 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
* @var resource
*/
var $fh;
- /**
- * The fetcher prefix associated with this archive.
- * @var string
- */
- var $fetcher_prefix;
/**
* How many bytes to read into buffer from bz2 stream in one go
@@ -476,17 +472,5 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
return $html;
}
- /**
- * Returns the path to an archive given its timestamp.
- *
- * @param string $timestamp the archive timestamp
- * @return string the path to the archive, based off of the fetcher prefix
- * used when this iterator was constructed
- */
- function get_archive_name($timestamp)
- {
- return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
- self::archive_base_name.$timestamp;
- }
}
?>
diff --git a/lib/archive_bundle_iterators/web_archive_bundle_iterator.php b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
index ec3ab280d..6479b99de 100644
--- a/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
@@ -49,7 +49,8 @@ require_once BASE_DIR.
* @subpackage iterator
* @see WebArchiveBundle
*/
-class WebArchiveBundleIterator implements CrawlConstants
+class WebArchiveBundleIterator extends ArchiveBundleIterator
+ implements CrawlConstants
{
/**
@@ -88,11 +89,6 @@ class WebArchiveBundleIterator implements CrawlConstants
* @var object
*/
var $archive;
- /**
- * The fetcher prefix associated with this archive.
- * @var string
- */
- var $fetcher_prefix;
/**
* Creates a web archive iterator with the given parameters.
@@ -206,19 +202,5 @@ class WebArchiveBundleIterator implements CrawlConstants
$archive_name = $this->get_archive_name($this->result_timestamp);
@unlink("$archive_name/iterate_status.txt");
}
-
- /**
- * Returns the path to an archive given its timestamp.
- *
- * @param string $timestamp the archive timestamp
- * @return string the path to the archive, based off of the fetcher prefix
- * used when this iterator was constructed
- */
- function get_archive_name($timestamp)
- {
- return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
- self::archive_base_name.$timestamp;
- }
-
}
?>