Work to get http-equiv refresh and location to work smoothly, a=chris

Chris Pollett [2011-10-24]
Work to get http-equiv refresh and location to work smoothly, a=chris
Files changed:
bin/fetcher.php
lib/fetch_url.php
lib/index_bundle_iterators/group_iterator.php
lib/index_bundle_iterators/word_iterator.php
lib/processors/html_processor.php
models/phrase_model.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index d2482a904..3d0ee70be 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -871,12 +871,13 @@ class Fetcher implements CrawlConstants

         foreach($site_pages as $site) {
             $response_code = $site[self::HTTP_CODE];
-
-	    //deals with short URLs and directs them to the original link
-	    if(isset($site[self::LOCATION]))
-	    {
-	    	$site[self::URL]=$site[self::LOCATION];
-	    }
+
+            //deals with short URLs and directs them to the original link
+            if(isset($site[self::LOCATION]) &&
+                count($site[self::LOCATION]) > 0) {
+                array_unshift($site[self::LOCATION], $site[self::URL]);
+                $site[self::URL] = array_pop($site[self::LOCATION]);
+            }

             //process robot.txt files separately
             if(isset($site[self::ROBOT_PATHS])) {
@@ -930,7 +931,7 @@ class Fetcher implements CrawlConstants
                     $site[self::ENCODING] != "" &&
                     ($page_processor == "TextProcessor" ||
                     is_subclass_of($page_processor, "TextProcessor"))) {
-                    if(!mb_check_encoding($site[self::PAGE],
+                    if(!@mb_check_encoding($site[self::PAGE],
                         $site[self::ENCODING])) {
                         crawlLog("  NOT VALID ENCODING DETECTED!!");
                     }
@@ -945,8 +946,13 @@ class Fetcher implements CrawlConstants
             } else {
                 $doc_info = false;
             }
+
             if($doc_info) {
                 $site[self::DOC_INFO] =  $doc_info;
+                if(isset($doc_info[self::LOCATION])) {
+                    $site[self::HASH] = crawlHash(
+                        crawlHash($site[self::URL], true). "LOCATION", true);
+                }
                 $site[self::ROBOT_INSTANCE] = ROBOT_INSTANCE;

                 if(!is_dir(CRAWL_DIR."/cache")) {
@@ -999,7 +1005,6 @@ class Fetcher implements CrawlConstants
                     $this->processSubdocs($i, $site, $summarized_site_pages,
                        $stored_site_pages);
                 }
-
                 $i++;
             }
         } // end for
@@ -1038,7 +1043,8 @@ class Fetcher implements CrawlConstants
         $summary_fields = array(self::IP_ADDRESSES, self::WEIGHT,
             self::TIMESTAMP, self::TYPE, self::ENCODING, self::HTTP_CODE,
             self::HASH, self::SERVER, self::SERVER_VERSION,
-            self::OPERATING_SYSTEM, self::MODIFIED, self::ROBOT_INSTANCE);
+            self::OPERATING_SYSTEM, self::MODIFIED, self::ROBOT_INSTANCE,
+            self::LOCATION);

         foreach($summary_fields as $field) {
             if(isset($site[$field])) {
@@ -1504,8 +1510,15 @@ class Fetcher implements CrawlConstants

             foreach($site[self::LINKS] as $url => $link_text) {
                 $link_meta_ids = array();
+                $location_link = false;
                 if(strlen($url) > 0) {
                     $summary = array();
+                    if(substr($link_text, 0, 9) == "location:") {
+                        $location_link = true;
+                        $link_meta_ids[] = $link_text;
+                        $link_meta_ids[] = "location:".
+                            crawlHash($site[self::URL]);
+                    }
                     $elink_flag = (UrlParser::getHost($url) !=
                         UrlParser::getHost($site[self::URL])) ? true : false;
                     $had_links = true;
@@ -1540,7 +1553,8 @@ class Fetcher implements CrawlConstants
                         mb_ereg_replace(PUNCT, " ", $link_text);
                     $link_word_lists =
                         PhraseParser::extractPhrasesInLists($link_text,
-                            MAX_PHRASE_LEN, $lang);
+                        MAX_PHRASE_LEN, $lang);
+
                     $index_shard->addDocumentWords($link_keys,
                         self::NEEDS_OFFSET_FLAG,
                         $link_word_lists, $link_meta_ids, false, $link_rank);
@@ -1596,6 +1610,13 @@ class Fetcher implements CrawlConstants
         $meta_ids[] = 'info:'.$site[self::URL];
         $meta_ids[] = 'info:'.crawlHash($site[self::URL]);
         $meta_ids[] = 'site:all';
+        if(isset($site[self::LOCATION]) && count($site[self::LOCATION]) > 0){
+            foreach($site[self::LOCATION] as $location) {
+                $meta_ids[] = 'info:'.$location;
+                $meta_ids[] = 'info:'.crawlHash($location);
+                $meta_ids[] = 'location:'.$location;
+            }
+        }

         foreach($site[self::IP_ADDRESSES] as $address) {
             $meta_ids[] = 'ip:'.$address;
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index 3bb840f31..94b9cbb16 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -229,6 +229,8 @@ class FetchUrl implements CrawlConstants
     {
         $new_offset = 0;
         // header will include all redirect headers
+        $site = array();
+        $site[CrawlConstants::LOCATION] = array();
         do {
             $CRLFCRLF = strpos($header_and_page, "\x0D\x0A\x0D\x0A",
                 $new_offset);
@@ -238,14 +240,27 @@ class FetchUrl implements CrawlConstants
             $header_offset = ($CRLFCRLF > 0) ? $CRLFCRLF : $LFLF;
             $new_offset = ($CRLFCRLF > 0) ? $header_offset + 4
                 : $header_offset + 2;
-            $redirect_pos = strpos($header_and_page, 'Location:', $old_offset);
+            $redirect_pos = stripos($header_and_page, 'Location:', $old_offset);
+            $redirect_str = "Location:";
+            if($redirect_pos == false) {
+                $redirect_pos =
+                    stripos($header_and_page, 'Refresh:', $old_offset);
+                $redirect_str = "Refresh:";
+            }
             if(isset($header_and_page[$redirect_pos - 1]) &&
                 ord($header_and_page[$redirect_pos - 1]) > 32) {
                 $redirect_pos = $new_offset; //ignore X-XRDS-Location header
+            } else if($redirect_pos !== false){
+                $redirect_pos += strlen($redirect_str);
+                $pre_line = substr($header_and_page, $redirect_pos,
+                    strpos($header_and_page, "\n", $redirect_pos) -
+                    $redirect_pos);
+                $site[CrawlConstants::LOCATION][] = @trim($pre_line);
+
             }
         } while($redirect_pos !== false && $redirect_pos < $new_offset);

-        $site = array();
+
         $site[CrawlConstants::HEADER] =
             substr($header_and_page, 0, $header_offset);
         $site[$value] = ltrim(substr($header_and_page, $header_offset));
@@ -281,11 +296,6 @@ class FetchUrl implements CrawlConstants
                 $site[CrawlConstants::MODIFIED] =
                     strtotime(@trim($line_parts[1]));
             }
-  	    if(stristr($line,'Location:')){
-	    	$line_parts=explode("Location:",line);
-		$site[CrawlConstants::LOCATION]=@trim(line_parts[1]);
-	    }
-
         }
         if(!isset($site[CrawlConstants::ENCODING]) ) {
         //first guess we are html and try to find charset in doc head
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index d998f934a..767b39ba4 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -316,8 +316,12 @@ class GroupIterator extends IndexBundleIterator
     {
         $domain_vector = array();
         foreach($pre_out_pages as $hash_url => $data) {
-            if(!$data[0][self::IS_DOC]) {
-                $item = $this->lookupDoc($data[0]['KEY']);
+            $hash = substr($data[0]['KEY'], IndexShard::DOC_KEY_LEN,
+                IndexShard::DOC_KEY_LEN);
+            if(!$data[0][self::IS_DOC] ||
+                crawlHash($hash_url. "LOCATION", true) == $hash) {
+                $item = $this->lookupDoc($data[0]['KEY'],
+                    $data[0][self::IS_DOC]);
                 if($item != false) {
                     array_unshift($pre_out_pages[$hash_url], $item);
                 }
@@ -350,14 +354,16 @@ class GroupIterator extends IndexBundleIterator
      * Looks up a doc for a link doc_key, so can get its summary info
      *
      * @param string $doc_key key to look up doc of
+     * @param bool $is_location we are doing look up because doc had a refresh
      *
      * @return array consisting of info about the doc
      */
-     function lookupDoc($doc_key)
+     function lookupDoc($doc_key, $is_location = false)
      {
         $hash_url = substr($doc_key, 0, IndexShard::DOC_KEY_LEN);
+        $prefix = ($is_location) ? "location:" : "info:";
         $hash_info_url=
-            crawlHash("info:".base64Hash($hash_url), true);
+            crawlHash($prefix.base64Hash($hash_url), true);
         $index = $this->getIndex($doc_key);
         $word_iterator =
              new WordIterator($hash_info_url,
@@ -371,6 +377,9 @@ class GroupIterator extends IndexBundleIterator
             $keys = array_keys($doc_array);
             $key = $keys[0];
             $item = $doc_array[$key];
+            if(!$item[self::IS_DOC] && $is_location) {
+                return $this->lookupDoc($key);
+            }
             $item[self::RELEVANCE] = $relevance;
             $item[self::SCORE] = $item[self::DOC_RANK]*pow(1.1, $relevance);
             $item['KEY'] = $key;
diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php
index f363ca016..0fd17b6eb 100644
--- a/lib/index_bundle_iterators/word_iterator.php
+++ b/lib/index_bundle_iterators/word_iterator.php
@@ -168,7 +168,6 @@ class WordIterator extends IndexBundleIterator
         $this->current_block_fresh = false;
         $this->dictionary_info =
             $index->dictionary->getWordInfo($word_key, true);
-
         if ($this->dictionary_info === false) {
             $this->empty = true;
         } else {
@@ -253,7 +252,6 @@ class WordIterator extends IndexBundleIterator
         $results = $shard->getPostingsSlice(
             $this->start_offset,
             $this->next_offset, $this->last_offset, $this->results_per_block);
-
         if($this->filter != NULL) {
             foreach($results as $keys => $data) {
                 $host_key = substr($keys, self::HOST_KEY_POS, self::KEY_LEN);
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index 5c216c108..9c76a3463 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -79,9 +79,18 @@ class HtmlProcessor extends TextProcessor
                 $summary[self::LANG] = self::lang($dom,
                     $summary[self::DESCRIPTION], $url);
                 $summary[self::LINKS] = self::links($dom, $url);
+                $location = self::location($dom, $url);
+                if($location) {
+                    $summary[self::LINKS][$location] = "location:".$url;
+                    $summary[self::LOCATION] = true;
+                    $summary[self::DESCRIPTION] .= $url." => ".$location;
+                    if(!$summary[self::TITLE]) {
+                        $summary[self::TITLE] = $url;
+                    }
+                }
                 $summary[self::PAGE] = $page;
                 if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE])
-                    == 0 && count($summary[self::LINKS]) == 0) {
+                    == 0 && count($summary[self::LINKS]) == 0 && !$location) {
                     //maybe not html? treat as text still try to get urls
                     $summary = parent::process($page, $url);
                 }
@@ -117,8 +126,9 @@ class HtmlProcessor extends TextProcessor
                 "<h1><h2><h3><h4><h5><h6><p><div>".
                 "<a><table><tr><td><th>";
             $body = strip_tags($page, $body_tags);
-            $page = "<html><head>$head</head><body>$body</body>";
+            $page = "<html><head>$head</head><body>$body</body></html>";
         }
+
         $dom = new DOMDocument();

         //this hack modified from php.net
@@ -230,7 +240,7 @@ class HtmlProcessor extends TextProcessor

         //look for a meta tag with a description
         foreach($metas as $meta) {
-            if(mb_stristr($meta->getAttribute('name'), "description")) {
+            if(stristr($meta->getAttribute('name'), "description")) {
                 $description .= " ".$meta->getAttribute('content');
             }
         }
@@ -255,6 +265,29 @@ class HtmlProcessor extends TextProcessor
         return $description;
     }

+    /**
+     * Returns the http-equiv refresh/location redirect URL found in $dom, or false if none
+     */
+    static function location($dom, $site)
+    {
+        $xpath = new DOMXPath($dom);
+        //Look for Refresh or Location
+        $metas = $xpath->evaluate("/html//meta");
+        foreach($metas as $meta) {
+            if(stristr($meta->getAttribute('http-equiv'), "refresh") ||
+               stristr($meta->getAttribute('http-equiv'), "location")) {
+                $urls = explode("=", $meta->getAttribute('content'));
+                if(isset($urls[1]) && !UrlParser::checkRecursiveUrl($urls[1]) &&
+                    strlen($urls[1]) < MAX_URL_LENGTH) {
+                    $url = @trim($urls[1]);
+                    return $url;
+                }
+            }
+        }
+
+        return false;
+    }
+
     /**
      * Returns up to MAX_LINK_PER_PAGE many links from the supplied
      * dom object where links have been canonicalized according to
@@ -270,6 +303,7 @@ class HtmlProcessor extends TextProcessor
         $sites = array();

         $xpath = new DOMXPath($dom);
+
         $base_refs = $xpath->evaluate("/html//base");
         if($base_refs->item(0)) {
             $tmp_site = $base_refs->item(0)->getAttribute('href');
@@ -278,9 +312,10 @@ class HtmlProcessor extends TextProcessor
             }
         }

+        $i = 0;
+
         $hrefs = $xpath->evaluate("/html/body//a");

-        $i = 0;

         foreach($hrefs as $href) {
             if($i < MAX_LINKS_PER_PAGE) {
diff --git a/models/phrase_model.php b/models/phrase_model.php
index e5b34f096..42205d40d 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -471,7 +471,7 @@ class PhraseModel extends Model
         $meta_words = array('link:', 'site:', 'version:', 'modified:',
             'filetype:', 'info:', '\-', 'os:', 'server:', 'date:',
             'index:', 'i:', 'ip:', 'weight:', 'w:', 'u:',
-            'lang:', 'media:', 'elink:');
+            'lang:', 'media:', 'elink:', 'location:');
         if(isset($this->additional_meta_words)) {
             $meta_words = array_merge($meta_words, array_keys(
                 $this->additional_meta_words));
ViewGit