Give every fetcher a numeric prefix and thread it through archive iteration.

Summary of what this patch does:
  * bin/fetcher.php: fetcher_num is always set (intval of argv[2], or 0 when
    no fetcher number is passed), so every "$prefix" is "<fetcher_num>-" and
    the per-method "fetcher_num !== false" checks are dropped. The archive
    base name in downloadPagesArchiveCrawl() now uses a double-quoted string
    so {$prefix} is actually interpolated.
  * lib/archive_bundle_iterators/*: each iterator constructor takes the
    fetcher prefix as a new first argument and builds archive paths through
    a get_archive_name($timestamp) helper.
  * bin/arc_tool.php: the archive listing glob expects a "<something>-"
    prefix in front of the archive / index base names.

Review fix applied to the first bin/fetcher.php hunk: the "} else {" line is
kept as context, so "fetcher_num = 0" is only the default for a missing
argv[2] and no longer overwrites intval($argv[2]). New-side line numbers of
the later bin/fetcher.php hunks are shifted by one accordingly, and the
post-image hash on that file's "index" line no longer describes the result.

NOTE(review): indentation and whitespace-only context lines were
reconstructed to match the hunk line counts; confirm against the target
tree (e.g. with "git apply --check") before applying.

diff --git a/bin/arc_tool.php b/bin/arc_tool.php
index faa0d561f..fe673bcf6 100755
--- a/bin/arc_tool.php
+++ b/bin/arc_tool.php
@@ -169,7 +169,7 @@ class ArcTool implements CrawlConstants
      */
     function outputArchiveList()
     {
-        $pattern = CRAWL_DIR."/cache/{".self::archive_base_name.",".
+        $pattern = CRAWL_DIR."/cache/*-{".self::archive_base_name.",".
             self::index_data_base_name."}*";
 
         $archives = glob($pattern, GLOB_BRACE);
diff --git a/bin/fetcher.php b/bin/fetcher.php
index edaab2418..0bc182572 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -355,11 +355,11 @@ class Fetcher implements CrawlConstants
         CrawlDaemon::init($argv, "fetcher");
         if(isset($argv[2]) ) {
             $this->fetcher_num = intval($argv[2]);
-            crawlLog("\n\nInitialize logger..", $this->fetcher_num."-fetcher");
         } else {
-            crawlLog("\n\nInitialize logger..", "fetcher");
+            $this->fetcher_num = 0;
         }
 
+        crawlLog("\n\nInitialize logger..", $this->fetcher_num."-fetcher");
         $this->loop();
     }
 
@@ -377,10 +377,7 @@ class Fetcher implements CrawlConstants
     {
         crawlLog("In Fetch Loop");
 
-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";
         if(!file_exists(CRAWL_DIR."/{$prefix}temp")) {
             mkdir(CRAWL_DIR."/{$prefix}temp");
         }
@@ -512,10 +509,7 @@ class Fetcher implements CrawlConstants
             return array();
         }
 
-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";
         $tmp_dir = CRAWL_DIR."/{$prefix}temp";
         $site_pages = FetchUrl::getPages($sites, true,
             $this->page_range_request, $tmp_dir
@@ -552,11 +546,8 @@ class Fetcher implements CrawlConstants
      */
     function downloadPagesArchiveCrawl()
     {
-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
-        $base_name = CRAWL_DIR.'/cache/{$prefix}'.self::archive_base_name.
+        $prefix = $this->fetcher_num."-";
+        $base_name = CRAWL_DIR."/cache/{$prefix}".self::archive_base_name.
             $this->crawl_index;
         $pages = array();
         if(!isset($this->archive_iterator->iterate_timestamp) ||
@@ -575,7 +566,8 @@ class Fetcher implements CrawlConstants
             }
             $iterator_name = $arctype."Iterator";
             $this->archive_iterator =
-                new $iterator_name($this->crawl_index, $this->crawl_time);
+                new $iterator_name($prefix, $this->crawl_index,
+                    $this->crawl_time);
             if($this->archive_iterator == NULL) {
                 crawlLog("Error creating archive iterator!!");
                 return $pages;
@@ -597,10 +589,7 @@ class Fetcher implements CrawlConstants
      */
     function deleteOldCrawls(&$still_active_crawls)
     {
-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";
         $dirs = glob(CRAWL_DIR.'/cache/*', GLOB_ONLYDIR);
 
         $full_base_name = $prefix . self::archive_base_name;
@@ -653,10 +642,7 @@ class Fetcher implements CrawlConstants
         $time = time();
         $session = md5($time . AUTH_KEY);
 
-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";
 
         /* if just restarted, check to make sure the crawl hasn't changed,
            if it has bail
@@ -742,10 +728,7 @@ class Fetcher implements CrawlConstants
      */
     function checkScheduler()
     {
-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";
 
         $info = array();
         if((count($this->to_crawl) > 0 || count($this->to_crawl_again) > 0) &&
@@ -987,10 +970,7 @@ class Fetcher implements CrawlConstants
         crawlLog(" Start process pages...");
         $start_time = microtime();
 
-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";
 
         $stored_site_pages = array();
         $summarized_site_pages = array();
@@ -1573,10 +1553,7 @@ class Fetcher implements CrawlConstants
     {
         $queue_server = $this->queue_servers[$this->current_server];
 
-        $prefix = "";
-        if($this->fetcher_num !== false) {
-            $prefix = $this->fetcher_num."-";
-        }
+        $prefix = $this->fetcher_num."-";
 
         if(count($this->to_crawl) <= 0) {
             $schedule_time = $this->schedule_time;
diff --git a/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php b/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
index 7120991d4..9739fb9ff 100644
--- a/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php
@@ -80,6 +80,11 @@ class ArcArchiveBundleIterator implements CrawlConstants
      * @var resource
      */
     var $fh;
+    /**
+     * The fetcher prefix associated with this archive.
+     * @var string
+     */
+    var $fetcher_prefix;
 
     /**
      * Creates a arc archive iterator with the given parameters.
@@ -89,12 +94,12 @@ class ArcArchiveBundleIterator implements CrawlConstants
      * @param string $result_timestamp timestamp of the arc archive bundle
      *      results are being stored in
      */
-    function __construct($iterate_timestamp, $result_timestamp)
+    function __construct($prefix, $iterate_timestamp, $result_timestamp)
     {
+        $this->fetcher_prefix = $prefix;
         $this->iterate_timestamp = $iterate_timestamp;
         $this->result_timestamp = $result_timestamp;
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $iterate_timestamp;
+        $archive_name = $this->get_archive_name($iterate_timestamp);
         $this->partitions = array();
         foreach(glob("$archive_name/*.arc.gz") as $filename) {
             $this->partitions[] = $filename;
@@ -113,9 +118,6 @@ class ArcArchiveBundleIterator implements CrawlConstants
         } else {
             $this->reset();
         }
-
-
-
     }
 
     /**
@@ -139,8 +141,7 @@ class ArcArchiveBundleIterator implements CrawlConstants
         $this->end_of_iterator = false;
         $this->current_offset = 0;
         $this->fh = NULL;
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         @unlink("$archive_name/iterate_status.txt");
     }
 
@@ -176,8 +177,7 @@ class ArcArchiveBundleIterator implements CrawlConstants
             $this->current_offset = gztell($this->fh);
         }
 
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         $info = array();
         $info['end_of_iterator'] = $this->end_of_iterator;
         $info['current_partition_num'] = $this->current_partition_num;
@@ -217,5 +217,18 @@ class ArcArchiveBundleIterator implements CrawlConstants
         return $site;
     }
 
+    /**
+     * Returns the path to an archive given its timestamp.
+     *
+     * @param string $timestamp the archive timestamp
+     * @return string the path to the archive, based off of the fetcher prefix
+     *      used when this iterator was constructed
+     */
+    function get_archive_name($timestamp)
+    {
+        return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
+            self::archive_base_name.$timestamp;
+    }
+
 }
 ?>
diff --git a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
index 7e965924d..d1ff02d59 100644
--- a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
@@ -98,6 +98,11 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
      * @var resource
      */
     var $fh;
+    /**
+     * The fetcher prefix associated with this archive.
+     * @var string
+     */
+    var $fetcher_prefix;
 
     /**
      * Start state of FSA for lexing media wiki docs
@@ -140,12 +145,12 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
      * @param string $result_timestamp timestamp of the arc archive bundle
      *      results are being stored in
      */
-    function __construct($iterate_timestamp, $result_timestamp)
+    function __construct($prefix, $iterate_timestamp, $result_timestamp)
     {
+        $this->fetcher_prefix = $prefix;
         $this->iterate_timestamp = $iterate_timestamp;
         $this->result_timestamp = $result_timestamp;
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $iterate_timestamp;
+        $archive_name = $this->get_archive_name($iterate_timestamp);
         $this->partitions = array();
         foreach(glob("$archive_name/*.xml.bz2") as $filename) {
             $this->partitions[] = $filename;
@@ -262,8 +267,7 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
         $this->current_offset = 0;
         $this->fh = NULL;
         $this->buffer = "";
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         @unlink("$archive_name/iterate_status.txt");
     }
 
@@ -320,8 +324,7 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
             $this->current_page_num += $page_count;
         }
 
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         $info = array();
         $info['end_of_iterator'] = $this->end_of_iterator;
         $info['current_partition_num'] = $this->current_partition_num;
@@ -598,5 +601,19 @@ class MediaWikiArchiveBundleIterator implements CrawlConstants
         } while($continue && $pos < $len);
         return array($token, $state, $pos);
     }
+
+    /**
+     * Returns the path to an archive given its timestamp.
+     *
+     * @param string $timestamp the archive timestamp
+     * @return string the path to the archive, based off of the fetcher prefix
+     *      used when this iterator was constructed
+     */
+    function get_archive_name($timestamp)
+    {
+        return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
+            self::archive_base_name.$timestamp;
+    }
+
 }
 ?>
diff --git a/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php b/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
index f822df2fe..3054b3da9 100644
--- a/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/odp_rdf_bundle_iterator.php
@@ -91,10 +91,17 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
      * @var resource
      */
     var $fh;
+    /**
+     * The fetcher prefix associated with this archive.
+     * @var string
+     */
+    var $fetcher_prefix;
+
     /**
      * How many bytes to read into buffer from bz2 stream in one go
      */
     const BLOCK_SIZE = 8192;
+
     /**
      * Creates an open directory rdf archive iterator with the given parameters.
      *
@@ -103,12 +110,12 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
      * @param string $result_timestamp timestamp of the arc archive bundle
      *      results are being stored in
      */
-    function __construct($iterate_timestamp, $result_timestamp)
+    function __construct($prefix, $iterate_timestamp, $result_timestamp)
     {
+        $this->fetcher_prefix = $prefix;
         $this->iterate_timestamp = $iterate_timestamp;
         $this->result_timestamp = $result_timestamp;
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $iterate_timestamp;
+        $archive_name = $this->get_archive_name($iterate_timestamp);
         $this->partitions = array();
         foreach(glob("$archive_name/*.gz") as $filename) {
             $this->partitions[] = $filename;
@@ -263,8 +270,7 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
         $this->current_offset = 0;
         $this->fh = NULL;
         $this->buffer = "";
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         @unlink("$archive_name/iterate_status.txt");
     }
 
@@ -316,8 +322,7 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
             $this->current_page_num += $page_count;
         }
 
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         $info = array();
         $info['end_of_iterator'] = $this->end_of_iterator;
         $info['current_partition_num'] = $this->current_partition_num;
@@ -470,5 +475,18 @@ class OdpRdfArchiveBundleIterator implements CrawlConstants
         }
         return $html;
     }
+
+    /**
+     * Returns the path to an archive given its timestamp.
+     *
+     * @param string $timestamp the archive timestamp
+     * @return string the path to the archive, based off of the fetcher prefix
+     *      used when this iterator was constructed
+     */
+    function get_archive_name($timestamp)
+    {
+        return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
+            self::archive_base_name.$timestamp;
+    }
 }
 ?>
diff --git a/lib/archive_bundle_iterators/web_archive_bundle_iterator.php b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
index c8ea2505d..ec3ab280d 100644
--- a/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
@@ -88,24 +88,29 @@ class WebArchiveBundleIterator implements CrawlConstants
      * @var object
      */
     var $archive;
+    /**
+     * The fetcher prefix associated with this archive.
+     * @var string
+     */
+    var $fetcher_prefix;
 
     /**
      * Creates a web archive iterator with the given parameters.
      *
+     * @param string $prefix fetcher number this bundle is associated with
      * @param string $iterate_timestamp timestamp of the web archive bundle to
      *      iterate over the pages of
      * @param string $result_timestamp timestamp of the web archive bundle
      *      results are being stored in
      */
-    function __construct($iterate_timestamp, $result_timestamp)
+    function __construct($prefix, $iterate_timestamp, $result_timestamp)
     {
+        $this->fetcher_prefix = $prefix;
         $this->iterate_timestamp = $iterate_timestamp;
         $this->result_timestamp = $result_timestamp;
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $iterate_timestamp;
+        $archive_name = $this->get_archive_name($iterate_timestamp);
         $this->archive = new WebArchiveBundle($archive_name);
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $result_timestamp;
+        $archive_name = $this->get_archive_name($result_timestamp);
         if(file_exists("$archive_name/iterate_status.txt")) {
             $info = unserialize(file_get_contents(
                 "$archive_name/iterate_status.txt"));
@@ -170,8 +175,7 @@ class WebArchiveBundleIterator implements CrawlConstants
         $this->end_of_iterator = ($this->overall_index >= $this->count ) ?
             true : false;
 
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         $info = array();
         $info['overall_index'] = $this->overall_index;
         $info['end_of_iterator'] = $this->end_of_iterator;
@@ -199,10 +203,22 @@ class WebArchiveBundleIterator implements CrawlConstants
         $this->partition = $this->archive->getPartition(
             $this->current_partition_num, false);
         $this->partition->reset();
-        $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name.
-            $this->result_timestamp;
+        $archive_name = $this->get_archive_name($this->result_timestamp);
         @unlink("$archive_name/iterate_status.txt");
     }
 
+    /**
+     * Returns the path to an archive given its timestamp.
+     *
+     * @param string $timestamp the archive timestamp
+     * @return string the path to the archive, based off of the fetcher prefix
+     *      used when this iterator was constructed
+     */
+    function get_archive_name($timestamp)
+    {
+        return CRAWL_DIR.'/cache/'.$this->fetcher_prefix.
+            self::archive_base_name.$timestamp;
+    }
+
 }
 ?>