Fixes a bug in charset parsing, a=chris

Chris Pollett [2011-10-27, 20h]
Fixes a bug in charset parsing, a=chris
Filename
bin/fetcher.php
lib/fetch_url.php
lib/index_bundle_iterators/group_iterator.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 3d0ee70be..4ee03ab73 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -937,7 +937,7 @@ class Fetcher implements CrawlConstants
                     }
                     crawlLog("  Converting from encoding ".
                         $site[self::ENCODING]."...");
-                    $site[self::PAGE] = mb_convert_encoding($site[self::PAGE],
+                    $site[self::PAGE] = @mb_convert_encoding($site[self::PAGE],
                         "UTF-8", $site[self::ENCODING]);
                 }
                 crawlLog("  Using Processor...".$page_processor);
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index 3f938b286..b7c144741 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -308,10 +308,18 @@ class FetchUrl implements CrawlConstants
         if($end_head) {
             $len_c = strlen("charset=");
             $start_charset = stripos($site[$value],
-                "charset=") + $len_c;
-            if($start_charset && $start_charset < $end_head) {
+                "charset=");
+            if($start_charset && $start_charset + $len_c < $end_head) {
+                $start_charset += $len_c;
                 $end_charset = stripos($site[$value],
                     '"', $start_charset);
+                $end_charset2 = false;
+                if(!$end_charset) {
+                    $end_charset2 = stripos($site[$value], "'", $start_charset);
+                }
+                if($end_charset && $end_charset2) {
+                    $end_charset = min($end_charset, $end_charset2);
+                }
                 if($end_charset && $end_charset < $end_head) {
                     $pre_charset = substr($site[$value],
                         $start_charset, $end_charset - $start_charset);
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index 7ccd09b9b..cb5e06ebb 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -315,12 +315,11 @@ class GroupIterator extends IndexBundleIterator
     function groupByHashAndAggregate(&$pre_out_pages)
     {
         foreach($pre_out_pages as $hash_url => $data) {
-            $hash = substr($data[0]['KEY'], IndexShard::DOC_KEY_LEN,
-                IndexShard::DOC_KEY_LEN);
-            if(!$data[0][self::IS_DOC] ||
-                crawlHash($hash_url. "LOCATION", true) == $hash) {
+            $hash = $pre_out_pages[$hash_url][0][self::HASH];
+            $is_location = (crawlHash($hash_url. "LOCATION", true) == $hash);
+            if(!$data[0][self::IS_DOC] || $is_location) {
                 $item = $this->lookupDoc($data[0]['KEY'],
-                    $data[0][self::IS_DOC]);
+                    $is_location);
                 if($item != false) {
                     array_unshift($pre_out_pages[$hash_url], $item);
                 }
@@ -380,15 +379,19 @@ class GroupIterator extends IndexBundleIterator
             $keys = array_keys($doc_array);
             $key = $keys[0];
             $item = $doc_array[$key];
-            if(!$item[self::IS_DOC] && $is_location) {
-                return $this->lookupDoc($key);
+            $hash = substr($key, IndexShard::DOC_KEY_LEN,
+                IndexShard::DOC_KEY_LEN);
+            $is2_location = (crawlHash($hash_url. "LOCATION", true) == $hash);
+            if($is2_location) {
+                return $this->lookupDoc($key, $is2_location);
+            } else if(!isset($item[self::IS_DOC]) || !$item[self::IS_DOC]) {
+                return $this->lookupDoc($key, false);
             }
             $item[self::RELEVANCE] = $relevance;
             $item[self::SCORE] = $item[self::DOC_RANK]*pow(1.1, $relevance);
             $item['KEY'] = $key;
             $item['INDEX'] = $word_iterator->index;
-            $item[self::HASH] = substr($key,
-                IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+            $item[self::HASH] = $hash;
             $item[self::INLINKS] = substr($key,
                 2*IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
         }
ViewGit