Bug fixes to chunk manipulation in archive iterators, a=chris

Chris Pollett [2013-03-25]
Bug fixes to chunk manipulation in archive iterators, a=chris
Filenames
bin/fetcher.php
bin/queue_server.php
lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
lib/archive_bundle_iterators/text_archive_bundle_iterator.php
models/crawl_model.php
models/parallel_model.php
models/phrase_model.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 669a695de..c7918493e 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -573,6 +573,7 @@ class Fetcher implements CrawlConstants

                 crawlLog("New name: ".$this->web_archive->dir_name);
                 crawlLog("Switching archive...");
+                continue;
             }

             switch($this->crawl_type)
@@ -756,7 +757,7 @@ class Fetcher implements CrawlConstants
         $files = glob(CRAWL_DIR.'/schedules/*');
         $names = array(self::fetch_batch_name, self::fetch_crawl_info,
             self::fetch_closed_name, self::schedule_name,
-            self::fetch_archive_iterator);
+            self::fetch_archive_iterator, self::save_point);
         foreach($files as $file) {
             $timestamp = "";
             foreach($names as $name) {
@@ -770,7 +771,11 @@ class Fetcher implements CrawlConstants
             }

             if($timestamp !== "" && !in_array($timestamp,$still_active_crawls)){
-                unlink($file);
+                if(is_dir($file)) {
+                    $this->db->unlinkRecursive($file);
+                } else {
+                    unlink($file);
+                }
             }
         }
     }
@@ -911,11 +916,12 @@ class Fetcher implements CrawlConstants
             if(general_is_a($this->arc_type."Iterator",
                     "TextArchiveBundleIterator")) {
                 $result_dir = WORK_DIRECTORY . "/schedules/" .
-                    self::fetch_archive_iterator.$this->crawl_time;
+                    $prefix.self::fetch_archive_iterator.$this->crawl_time;
                 $iterator_name = $this->arc_type."Iterator";
                 $this->archive_iterator = new $iterator_name(
                     $info[self::CRAWL_INDEX],
                     false, $this->crawl_time, $result_dir);
+                $this->db->setWorldPermissionsRecursive($result_dir);
             }
         }
         crawlLog("End Name Server Check");
@@ -1077,18 +1083,21 @@ class Fetcher implements CrawlConstants
             // write a copy to disk in case something goes wrong.
             $pages = unserialize(gzuncompress(webdecode($info[self::DATA])));
             if($chunk) {
-                if(isset($pages[self::INI])) {
-                    $archive_iterator->setIniInfo($pages[self::INI]);
-                }
-                if($pages[self::ARC_DATA]) {
-                    $archive_iterator->makeBuffer($pages[self::ARC_DATA]);
-                }
-                if(!$pages[self::START_PARTITION]) {
-                    $archive_iterator->nextPages(1);
-                }
-                if(isset($pages[self::HEADER]) && is_array(
-                    $pages[self::HEADER]) && $pages[self::HEADER] != array()) {
-                    $archive_iterator->header = $pages[self::HEADER];
+                if(isset($pages[self::ARC_DATA]) ) {
+                    if(isset($pages[self::INI])) {
+                        $archive_iterator->setIniInfo($pages[self::INI]);
+                    }
+                    if($pages[self::ARC_DATA]) {
+                        $archive_iterator->makeBuffer($pages[self::ARC_DATA]);
+                    }
+                    if(isset($pages[self::HEADER]) &&
+                        is_array($pages[self::HEADER]) &&
+                        $pages[self::HEADER] != array()) {
+                        $archive_iterator->header = $pages[self::HEADER];
+                    }
+                    if(!$pages[self::START_PARTITION]) {
+                        $archive_iterator->nextPages(1);
+                    }
                 }
             } else {
                 $info[self::ARC_DATA] = $pages;
diff --git a/bin/queue_server.php b/bin/queue_server.php
index ad9b9009d..96d0559f0 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -1210,7 +1210,8 @@ class QueueServer implements CrawlConstants, Join
         $names_dir = array(self::schedule_data_base_name,
             self::index_data_base_name, self::robot_data_base_name,
             self::name_archive_iterator, self::fetch_archive_iterator);
-        $name_files = array(self::schedule_name, self::index_closed_name);
+        $name_files = array(self::schedule_name, self::index_closed_name,
+            self::save_point);
         $names = array_merge($name_files, $names_dir);
         foreach($files as $file) {
             $timestamp = "";
diff --git a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
index 3b2e37fd7..d81cf09d9 100644
--- a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
@@ -370,7 +370,6 @@ class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator
         if($first_call) {
             $this->initializeSubstitutions();
         }
-
         $page_info = $this->getNextTagData("page");
         if($no_process) { return $page_info; }
         $dom = new DOMDocument();
@@ -398,10 +397,10 @@ class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator
             "<body><h1>$pre_url</h1>\n";
         $pre_page = $this->getTextContent($dom, "/page/revision/text");
         $current_hash = crawlHash($pre_page);
-        if($this->last_hash == $current_hash) {
+  /*      if($this->last_hash == $current_hash) {
             $minimal_regexes = true;
         }
-        $this->last_hash = $current_hash;
+        $this->last_hash = $current_hash;*/
         if($first_call) {
             $this->saveCheckPoint(); //ensure we remember to advance one on fail
             $first_call = false;
diff --git a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
index b5d790c02..d3bd5b134 100644
--- a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
@@ -340,7 +340,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
                 if($this->checkFileHandle()) {
                     $this->fileClose();
                 }
-                if(!$this->iterate_dir) { //network case
+                if(!$this->iterate_dir) { //fetcher local case
                     break;
                 }
                 $this->current_partition_num++;
@@ -350,7 +350,6 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
                 }
                 $this->fileOpen(
                     $this->partitions[$this->current_partition_num]);
-
                 if($this->switch_partition_callback_name != NULL) {
                     $callback_name = $this->switch_partition_callback_name;
                     $result = $this->$callback_name();
diff --git a/models/crawl_model.php b/models/crawl_model.php
index 09f50c762..9316f1a83 100755
--- a/models/crawl_model.php
+++ b/models/crawl_model.php
@@ -721,12 +721,11 @@ EOT;
         $this->db->unlinkRecursive(
             CRAWL_DIR.'/schedules/'.self::name_archive_iterator.
             $timestamp, true);
-        $this->db->unlinkRecursive(
-            CRAWL_DIR.'/schedules/'.self::fetch_archive_iterator.
-            $timestamp, true);
-        $save_point_file = CRAWL_DIR.'/cache/'.self::save_point.
-            $timestamp.".txt";
-        @unlink($save_point_file);
+        $save_point_files = glob(CRAWL_DIR.'/schedules/'.self::save_point.
+            $timestamp."*.txt");
+        foreach($save_point_files as $save_point_file) {
+            @unlink($save_point_file);
+        }

         $this->db->selectDB(DB_NAME);
         $sql = "SELECT DISTINCT MIX_TIMESTAMP FROM MIX_COMPONENTS WHERE ".
diff --git a/models/parallel_model.php b/models/parallel_model.php
index df69257ab..0f6b1dd64 100755
--- a/models/parallel_model.php
+++ b/models/parallel_model.php
@@ -448,7 +448,7 @@ class ParallelModel extends Model implements CrawlConstants
             return;
         }

-        $save_files = glob(CRAWL_DIR.'/cache/Savepoint'.
+        $save_files = glob(CRAWL_DIR.'/schedules/'.self::save_point.
             $save_timestamp."*.txt");
         foreach($save_files as $save_file) {
             @unlink($save_file);
diff --git a/models/phrase_model.php b/models/phrase_model.php
index ad57ed8d7..95b61b6ac 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -966,7 +966,7 @@ class PhraseModel extends ParallelModel
         if($save_timestamp_name != "" && ($queue_servers == array() ||
             $this->isSingleLocalhost($queue_servers))) {
             // used for archive crawls of crawl mixes
-            $save_file = CRAWL_DIR.'/cache/'.self::save_point.
+            $save_file = CRAWL_DIR.'/schedules/'.self::save_point.
                 $save_timestamp_name.".txt";
             $iterators = $query_iterator->save_iterators;
             $cnt_iterators = count($iterators);
@@ -977,6 +977,7 @@ class PhraseModel extends ParallelModel
             }
             $results["SAVE_POINT"] = $save_point;
             file_put_contents($save_file, serialize($save_point));
+            $this->db->setWorldPermissionsRecursive($save_file);
         }
         $pages = array_values($pages);
         $result_count = count($pages);
@@ -1258,7 +1259,7 @@ class PhraseModel extends ParallelModel
             $doc_iterate_group_hash = crawlHash("site:doc");
             if($save_timestamp_name != "") {
                 // used for archive crawls of crawl mixes
-                $save_file = CRAWL_DIR.'/cache/'.self::save_point.
+                $save_file = CRAWL_DIR.'/schedules/'.self::save_point.
                     $save_timestamp_name.".txt";
                 if(file_exists($save_file)) {
                     $save_point =
ViewGit