Fix a bug with JavaScript appearing in extracted words where it is not wanted, a=chris
Fix a bug with JavaScript appearing in extracted words where it is not wanted, a=chris
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index 48498ce3b..01ca8f7b9 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -281,7 +281,8 @@ class IndexArchiveBundle implements CrawlConstants
$current_index_shard_file = $this->dir_name."/index".
$this->generation_info['CURRENT'];
$this->current_shard = new IndexShard($current_index_shard_file,
- $this->generation_info['CURRENT'], $num_docs_per_generation);
+ $this->generation_info['CURRENT'],
+ $this->num_docs_per_generation);
}
return $this->current_shard;
}
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index b32ab5a42..9389e3aba 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -70,6 +70,7 @@ class HtmlProcessor extends TextProcessor
{
$summary = NULL;
if(is_string($page)) {
+ $page = preg_replace('@<script[^>]*?.*?</script>@siu', ' ', $page);
$dom = self::dom($page);
if($dom !== false && self::checkMetaRobots($dom)) {
$summary[self::TITLE] = self::title($dom);