Fixes a bug in charset parsing, a=chris
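In lib/fetch_url.php, adds the length of "charset=" to the offset only after the marker has been found (a failed search no longer produces a usable offset) and also searches for a closing single quote; in bin/fetcher.php, suppresses warnings from mb_convert_encoding; in GroupIterator, reworks the LOCATION hash check that decides when lookupDoc is called, a=chris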
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 3d0ee70be..4ee03ab73 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -937,7 +937,7 @@ class Fetcher implements CrawlConstants
}
crawlLog(" Converting from encoding ".
$site[self::ENCODING]."...");
- $site[self::PAGE] = mb_convert_encoding($site[self::PAGE],
+ $site[self::PAGE] = @mb_convert_encoding($site[self::PAGE],
"UTF-8", $site[self::ENCODING]);
}
crawlLog(" Using Processor...".$page_processor);
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index 3f938b286..b7c144741 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -308,10 +308,18 @@ class FetchUrl implements CrawlConstants
if($end_head) {
$len_c = strlen("charset=");
$start_charset = stripos($site[$value],
- "charset=") + $len_c;
- if($start_charset && $start_charset < $end_head) {
+ "charset=");
+ if($start_charset && $start_charset + $len_c < $end_head) {
+ $start_charset += $len_c;
$end_charset = stripos($site[$value],
'"', $start_charset);
+ $end_charset2 = false;
+ if(!$end_charset) {
+ $end_charset2 = stripos($site[$value], "'", $start_charset);
+ }
+ if($end_charset && $end_charset2) {
+ $end_charset = min($end_charset, $end_charset2);
+ }
if($end_charset && $end_charset < $end_head) {
$pre_charset = substr($site[$value],
$start_charset, $end_charset - $start_charset);
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index 7ccd09b9b..cb5e06ebb 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -315,12 +315,11 @@ class GroupIterator extends IndexBundleIterator
function groupByHashAndAggregate(&$pre_out_pages)
{
foreach($pre_out_pages as $hash_url => $data) {
- $hash = substr($data[0]['KEY'], IndexShard::DOC_KEY_LEN,
- IndexShard::DOC_KEY_LEN);
- if(!$data[0][self::IS_DOC] ||
- crawlHash($hash_url. "LOCATION", true) == $hash) {
+ $hash = $pre_out_pages[$hash_url][0][self::HASH];
+ $is_location = (crawlHash($hash_url. "LOCATION", true) == $hash);
+ if(!$data[0][self::IS_DOC] || $is_location) {
$item = $this->lookupDoc($data[0]['KEY'],
- $data[0][self::IS_DOC]);
+ $is_location);
if($item != false) {
array_unshift($pre_out_pages[$hash_url], $item);
}
@@ -380,15 +379,19 @@ class GroupIterator extends IndexBundleIterator
$keys = array_keys($doc_array);
$key = $keys[0];
$item = $doc_array[$key];
- if(!$item[self::IS_DOC] && $is_location) {
- return $this->lookupDoc($key);
+ $hash = substr($key, IndexShard::DOC_KEY_LEN,
+ IndexShard::DOC_KEY_LEN);
+ $is2_location = (crawlHash($hash_url. "LOCATION", true) == $hash);
+ if($is2_location) {
+ return $this->lookupDoc($key, $is2_location);
+ } else if(!isset($item[self::IS_DOC]) || !$item[self::IS_DOC]) {
+ return $this->lookupDoc($key, false);
}
$item[self::RELEVANCE] = $relevance;
$item[self::SCORE] = $item[self::DOC_RANK]*pow(1.1, $relevance);
$item['KEY'] = $key;
$item['INDEX'] = $word_iterator->index;
- $item[self::HASH] = substr($key,
- IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+ $item[self::HASH] = $hash;
$item[self::INLINKS] = substr($key,
2*IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
}