Fix a bug with JavaScript appearing in extracted words where not wanted, a=chris

Chris Pollett [2011-01-31 07:Jan:st]
Fix a bug with JavaScript appearing in extracted words where not wanted, a=chris
Filename
lib/index_archive_bundle.php
lib/processors/html_processor.php
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index 48498ce3b..01ca8f7b9 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -281,7 +281,8 @@ class IndexArchiveBundle implements CrawlConstants
             $current_index_shard_file = $this->dir_name."/index".
                 $this->generation_info['CURRENT'];
             $this->current_shard = new IndexShard($current_index_shard_file,
-                $this->generation_info['CURRENT'], $num_docs_per_generation);
+                $this->generation_info['CURRENT'],
+                $this->num_docs_per_generation);
         }
         return $this->current_shard;
      }
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index b32ab5a42..9389e3aba 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -70,6 +70,7 @@ class HtmlProcessor extends TextProcessor
     {
         $summary = NULL;
         if(is_string($page)) {
+            $page = preg_replace('@<script[^>]*?.*?</script>@siu', ' ', $page);
             $dom = self::dom($page);
             if($dom !== false && self::checkMetaRobots($dom)) {
                 $summary[self::TITLE] = self::title($dom);
ViewGit