diff --git a/bin/fetcher.php b/bin/fetcher.php index 669a695de..c7918493e 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -573,6 +573,7 @@ class Fetcher implements CrawlConstants crawlLog("New name: ".$this->web_archive->dir_name); crawlLog("Switching archive..."); + continue; } switch($this->crawl_type) @@ -756,7 +757,7 @@ class Fetcher implements CrawlConstants $files = glob(CRAWL_DIR.'/schedules/*'); $names = array(self::fetch_batch_name, self::fetch_crawl_info, self::fetch_closed_name, self::schedule_name, - self::fetch_archive_iterator); + self::fetch_archive_iterator, self::save_point); foreach($files as $file) { $timestamp = ""; foreach($names as $name) { @@ -770,7 +771,11 @@ class Fetcher implements CrawlConstants } if($timestamp !== "" && !in_array($timestamp,$still_active_crawls)){ - unlink($file); + if(is_dir($file)) { + $this->db->unlinkRecursive($file); + } else { + unlink($file); + } } } } @@ -911,11 +916,12 @@ class Fetcher implements CrawlConstants if(general_is_a($this->arc_type."Iterator", "TextArchiveBundleIterator")) { $result_dir = WORK_DIRECTORY . "/schedules/" . - self::fetch_archive_iterator.$this->crawl_time; + $prefix.self::fetch_archive_iterator.$this->crawl_time; $iterator_name = $this->arc_type."Iterator"; $this->archive_iterator = new $iterator_name( $info[self::CRAWL_INDEX], false, $this->crawl_time, $result_dir); + $this->db->setWorldPermissionsRecursive($result_dir); } } crawlLog("End Name Server Check"); @@ -1077,18 +1083,21 @@ class Fetcher implements CrawlConstants // write a copy to disk in case something goes wrong. $pages = unserialize(gzuncompress(webdecode($info[self::DATA]))); if($chunk) { - if(isset($pages[self::INI])) { - $archive_iterator->setIniInfo($pages[self::INI]); - } - if($pages[self::ARC_DATA]) { - $archive_iterator->makeBuffer($pages[self::ARC_DATA]); - } - if(!$pages[self::START_PARTITION]) { - $archive_iterator->nextPages(1); - } - if(isset($pages[self::HEADER]) && is_array( - $pages[self::HEADER]) && $pages[self::HEADER] != array()) { - $archive_iterator->header = $pages[self::HEADER]; + if(isset($pages[self::ARC_DATA]) ) { + if(isset($pages[self::INI])) { + $archive_iterator->setIniInfo($pages[self::INI]); + } + if($pages[self::ARC_DATA]) { + $archive_iterator->makeBuffer($pages[self::ARC_DATA]); + } + if(isset($pages[self::HEADER]) && + is_array($pages[self::HEADER]) && + $pages[self::HEADER] != array()) { + $archive_iterator->header = $pages[self::HEADER]; + } + if(!$pages[self::START_PARTITION]) { + $archive_iterator->nextPages(1); + } } } else { $info[self::ARC_DATA] = $pages; diff --git a/bin/queue_server.php b/bin/queue_server.php index ad9b9009d..96d0559f0 100755 --- a/bin/queue_server.php +++ b/bin/queue_server.php @@ -1210,7 +1210,8 @@ class QueueServer implements CrawlConstants, Join $names_dir = array(self::schedule_data_base_name, self::index_data_base_name, self::robot_data_base_name, self::name_archive_iterator, self::fetch_archive_iterator); - $name_files = array(self::schedule_name, self::index_closed_name); + $name_files = array(self::schedule_name, self::index_closed_name, + self::save_point); $names = array_merge($name_files, $names_dir); foreach($files as $file) { $timestamp = ""; diff --git a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php index 3b2e37fd7..d81cf09d9 100644 --- a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php +++ b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php @@ -370,7 +370,6 @@ class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator if($first_call) { $this->initializeSubstitutions(); } - $page_info = $this->getNextTagData("page"); if($no_process) { return $page_info; } $dom = new DOMDocument(); @@ -398,10 +397,10 @@ class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator "<body><h1>$pre_url</h1>\n"; $pre_page = $this->getTextContent($dom, "/page/revision/text"); $current_hash = crawlHash($pre_page); - if($this->last_hash == $current_hash) { + /* if($this->last_hash == $current_hash) { $minimal_regexes = true; } - $this->last_hash = $current_hash; + $this->last_hash = $current_hash;*/ if($first_call) { $this->saveCheckPoint(); //ensure we remember to advance one on fail $first_call = false; diff --git a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php index b5d790c02..d3bd5b134 100644 --- a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php +++ b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php @@ -340,7 +340,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator if($this->checkFileHandle()) { $this->fileClose(); } - if(!$this->iterate_dir) { //network case + if(!$this->iterate_dir) { //fetcher local case break; } $this->current_partition_num++; @@ -350,7 +350,6 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator } $this->fileOpen( $this->partitions[$this->current_partition_num]); - if($this->switch_partition_callback_name != NULL) { $callback_name = $this->switch_partition_callback_name; $result = $this->$callback_name(); diff --git a/models/crawl_model.php b/models/crawl_model.php index 09f50c762..9316f1a83 100755 --- a/models/crawl_model.php +++ b/models/crawl_model.php @@ -721,12 +721,11 @@ EOT; $this->db->unlinkRecursive( CRAWL_DIR.'/schedules/'.self::name_archive_iterator. $timestamp, true); - $this->db->unlinkRecursive( - CRAWL_DIR.'/schedules/'.self::fetch_archive_iterator. - $timestamp, true); - $save_point_file = CRAWL_DIR.'/cache/'.self::save_point. - $timestamp.".txt"; - @unlink($save_point_file); + $save_point_files = glob(CRAWL_DIR.'/schedules/'.self::save_point. + $timestamp."*.txt"); + foreach($save_point_files as $save_point_file) { + @unlink($save_point_file); + } $this->db->selectDB(DB_NAME); $sql = "SELECT DISTINCT MIX_TIMESTAMP FROM MIX_COMPONENTS WHERE ". diff --git a/models/parallel_model.php b/models/parallel_model.php index df69257ab..0f6b1dd64 100755 --- a/models/parallel_model.php +++ b/models/parallel_model.php @@ -448,7 +448,7 @@ class ParallelModel extends Model implements CrawlConstants return; } - $save_files = glob(CRAWL_DIR.'/cache/Savepoint'. + $save_files = glob(CRAWL_DIR.'/schedules/'.self::save_point. $save_timestamp."*.txt"); foreach($save_files as $save_file) { @unlink($save_file); diff --git a/models/phrase_model.php b/models/phrase_model.php index ad57ed8d7..95b61b6ac 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -966,7 +966,7 @@ class PhraseModel extends ParallelModel if($save_timestamp_name != "" && ($queue_servers == array() || $this->isSingleLocalhost($queue_servers))) { // used for archive crawls of crawl mixes - $save_file = CRAWL_DIR.'/cache/'.self::save_point. + $save_file = CRAWL_DIR.'/schedules/'.self::save_point. $save_timestamp_name.".txt"; $iterators = $query_iterator->save_iterators; $cnt_iterators = count($iterators); @@ -977,6 +977,7 @@ class PhraseModel extends ParallelModel } $results["SAVE_POINT"] = $save_point; file_put_contents($save_file, serialize($save_point)); + $this->db->setWorldPermissionsRecursive($save_file); } $pages = array_values($pages); $result_count = count($pages); @@ -1258,7 +1259,7 @@ class PhraseModel extends ParallelModel $doc_iterate_group_hash = crawlHash("site:doc"); if($save_timestamp_name != "") { // used for archive crawls of crawl mixes - $save_file = CRAWL_DIR.'/cache/'.self::save_point. + $save_file = CRAWL_DIR.'/schedules/'.self::save_point. $save_timestamp_name.".txt"; if(file_exists($save_file)) { $save_point =