Clean up mediawiki archive bundle iterator code, a=chris

Chris Pollett [2013-03-25, hour 16]
Clean up mediawiki archive bundle iterator code, a=chris
Filename
lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
diff --git a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
index d81cf09d9..635af5193 100644
--- a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
@@ -89,15 +89,6 @@ EOD
 class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator
     implements CrawlConstants
 {
-
-   /**
-    * Used to store the hash of the last page processed. If see hash
-    * assume there was a problem with processing all the regexes in the
-    * last nextPage call and so this time do less regexes.
-    * @var string
-    */
-    var $last_hash = "";
-
     /**
      * Creates a media wiki archive iterator with the given parameters.
      *
@@ -315,29 +306,12 @@ class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator
     function restoreCheckPoint()
     {
         $info = parent::restoreCheckPoint();
-        if(isset($info["last_hash"])) {
-            $this->last_hash = $info["last_hash"];
-        }
         if(!$this->iterate_dir) { // do on client not name server
             $this->initializeSubstitutions();
         }
         return $info;
     }

-    /**
-     * Stores the current progress to the file iterate_status.txt in the result
-     * dir such that a new instance of the iterator could be constructed and
-     * return the next set of pages without having to process all of the pages
-     * that came before. Each iterator should make a call to saveCheckpoint
-     * after extracting a batch of pages.
-     * @param array $info any extra info a subclass wants to save
-     */
-    function saveCheckPoint($info=array())
-    {
-        $info["last_hash"] = $this->last_hash;
-        parent::saveCheckPoint($info);
-    }
-
     /**
      * Gets the text content of the first dom node satisfying the
      * xpath expression $path in the dom document $dom
@@ -397,28 +371,22 @@ class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator
             "<body><h1>$pre_url</h1>\n";
         $pre_page = $this->getTextContent($dom, "/page/revision/text");
         $current_hash = crawlHash($pre_page);
-  /*      if($this->last_hash == $current_hash) {
-            $minimal_regexes = true;
-        }
-        $this->last_hash = $current_hash;*/
         if($first_call) {
             $this->saveCheckPoint(); //ensure we remember to advance one on fail
             $first_call = false;
         }
         $toc = $this->makeTableOfContents($pre_page);
-        if(!$minimal_regexes) {
-            list($pre_page, $references) = $this->makeReferences($pre_page);
-            $pre_page = preg_replace_callback('/(\A|\n){\|(.*?)\n\|}/s',
-                "makeTableCallback", $pre_page);
-            $pre_page = preg_replace($this->matches, $this->replaces,$pre_page);
-            $pre_page = preg_replace("/{{Other uses}}/i",
-                    "<div class='indent'>\"$1\". (<a href='".
-                    $site[self::URL]. "_(disambiguation)'>$pre_url</a>)</div>",
-                    $pre_page);
-            $pre_page = preg_replace_callback("/((href=)\"([^\"]+)\")/",
-                "fixLinksCallback", $pre_page);
-            $pre_page = $this->insertReferences($pre_page, $references);
-        }
+        list($pre_page, $references) = $this->makeReferences($pre_page);
+        $pre_page = preg_replace_callback('/(\A|\n){\|(.*?)\n\|}/s',
+            "makeTableCallback", $pre_page);
+        $pre_page = preg_replace($this->matches, $this->replaces,$pre_page);
+        $pre_page = preg_replace("/{{Other uses}}/i",
+                "<div class='indent'>\"$1\". (<a href='".
+                $site[self::URL]. "_(disambiguation)'>$pre_url</a>)</div>",
+                $pre_page);
+        $pre_page = preg_replace_callback("/((href=)\"([^\"]+)\")/",
+            "fixLinksCallback", $pre_page);
+        $pre_page = $this->insertReferences($pre_page, $references);
         $pre_page = $this->insertTableOfContents($pre_page, $toc);
         $site[self::PAGE] .= $pre_page;
         $site[self::PAGE] .= "\n</body>\n</html>";
ViewGit