Clean up mediawiki archive bundle iterator code, a=chris
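Remove the $last_hash property together with its checkpoint save/restore handling and the commented-out hash comparison, and drop the $minimal_regexes guard so the page regex substitutions always run.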
diff --git a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
index d81cf09d9..635af5193 100644
--- a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php
@@ -89,15 +89,6 @@ EOD
class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator
implements CrawlConstants
{
-
- /**
- * Used to store the hash of the last page processed. If see hash
- * assume there was a problem with processing all the regexes in the
- * last nextPage call and so this time do less regexes.
- * @var string
- */
- var $last_hash = "";
-
/**
* Creates a media wiki archive iterator with the given parameters.
*
@@ -315,29 +306,12 @@ class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator
function restoreCheckPoint()
{
$info = parent::restoreCheckPoint();
- if(isset($info["last_hash"])) {
- $this->last_hash = $info["last_hash"];
- }
if(!$this->iterate_dir) { // do on client not name server
$this->initializeSubstitutions();
}
return $info;
}
- /**
- * Stores the current progress to the file iterate_status.txt in the result
- * dir such that a new instance of the iterator could be constructed and
- * return the next set of pages without having to process all of the pages
- * that came before. Each iterator should make a call to saveCheckpoint
- * after extracting a batch of pages.
- * @param array $info any extra info a subclass wants to save
- */
- function saveCheckPoint($info=array())
- {
- $info["last_hash"] = $this->last_hash;
- parent::saveCheckPoint($info);
- }
-
/**
* Gets the text content of the first dom node satisfying the
* xpath expression $path in the dom document $dom
@@ -397,28 +371,22 @@ class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator
"<body><h1>$pre_url</h1>\n";
$pre_page = $this->getTextContent($dom, "/page/revision/text");
$current_hash = crawlHash($pre_page);
- /* if($this->last_hash == $current_hash) {
- $minimal_regexes = true;
- }
- $this->last_hash = $current_hash;*/
if($first_call) {
$this->saveCheckPoint(); //ensure we remember to advance one on fail
$first_call = false;
}
$toc = $this->makeTableOfContents($pre_page);
- if(!$minimal_regexes) {
- list($pre_page, $references) = $this->makeReferences($pre_page);
- $pre_page = preg_replace_callback('/(\A|\n){\|(.*?)\n\|}/s',
- "makeTableCallback", $pre_page);
- $pre_page = preg_replace($this->matches, $this->replaces,$pre_page);
- $pre_page = preg_replace("/{{Other uses}}/i",
- "<div class='indent'>\"$1\". (<a href='".
- $site[self::URL]. "_(disambiguation)'>$pre_url</a>)</div>",
- $pre_page);
- $pre_page = preg_replace_callback("/((href=)\"([^\"]+)\")/",
- "fixLinksCallback", $pre_page);
- $pre_page = $this->insertReferences($pre_page, $references);
- }
+ list($pre_page, $references) = $this->makeReferences($pre_page);
+ $pre_page = preg_replace_callback('/(\A|\n){\|(.*?)\n\|}/s',
+ "makeTableCallback", $pre_page);
+ $pre_page = preg_replace($this->matches, $this->replaces,$pre_page);
+ $pre_page = preg_replace("/{{Other uses}}/i",
+ "<div class='indent'>\"$1\". (<a href='".
+ $site[self::URL]. "_(disambiguation)'>$pre_url</a>)</div>",
+ $pre_page);
+ $pre_page = preg_replace_callback("/((href=)\"([^\"]+)\")/",
+ "fixLinksCallback", $pre_page);
+ $pre_page = $this->insertReferences($pre_page, $references);
$pre_page = $this->insertTableOfContents($pre_page, $toc);
$site[self::PAGE] .= $pre_page;
$site[self::PAGE] .= "\n</body>\n</html>";