First pass fetcher-based link pruning, a=chris

Chris Pollett [2013-03-01 02:Mar:st]
First pass fetcher-based link pruning, a=chris
Filename
bin/fetcher.php
configs/config.php
lib/processors/html_processor.php
lib/processors/pptx_processor.php
lib/processors/rss_processor.php
lib/processors/text_processor.php
lib/processors/xlsx_processor.php
models/machine_model.php
tests/url_parser_test.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index a3a1517c8..976241349 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -407,7 +407,7 @@ class Fetcher implements CrawlConstants

         $this->indexed_file_types = $indexed_file_types;
         $this->all_file_types = $indexed_file_types;
-        $this->restrict_sites_by_url = true;
+        $this->restrict_sites_by_url = false;
         $this->allowed_sites = array();
         $this->disallowed_sites = array();

@@ -806,6 +806,7 @@ class Fetcher implements CrawlConstants
             "&crawl_time=$crawl_time";
         $info_string = FetchUrl::getPage($request);
         $info = @unserialize(trim($info_string));
+
         if(isset($info[self::SAVED_CRAWL_TIMES])) {
             if(array_diff($info[self::SAVED_CRAWL_TIMES], $saved_crawl_times)
                 != array() ||
@@ -865,6 +866,7 @@ class Fetcher implements CrawlConstants
                 $this->arc_dir = '';
                 $this->arc_type = '';
             }
+            $this->setCrawlParamsFromArray($info);
             // Load any batch that might exist for changed-to crawl
             if(file_exists("$dir/$prefix".self::fetch_crawl_info.
                 "{$this->crawl_time}.txt") && file_exists(
@@ -1046,11 +1048,17 @@ class Fetcher implements CrawlConstants
         return $info;
     }

+    /**
+     * Returns true if memory usage exceeds 70% of the memory_limit ini setting
+     */
     function exceedMemoryThreshold()
     {
        return memory_get_usage() > (metricToInt(ini_get("memory_limit")) * 0.7);
     }

+    /**
+     * Picks the queue server to use and, if necessary, updates crawl settings
+     */
     function selectCurrentServerAndUpdateIfNeeded($at_least_once)
     {
         $i = 0;
@@ -1098,18 +1106,19 @@ class Fetcher implements CrawlConstants
         } else {
             $info[self::CURRENT_SERVER] = $this->current_server;
         }
-        if(isset($info[self::CRAWL_TYPE])) {
-            $this->crawl_type = $info[self::CRAWL_TYPE];
-        }
-        if(isset($info[self::CRAWL_INDEX])) {
-            $this->crawl_index = $info[self::CRAWL_INDEX];
-        }
-        if(isset($info[self::CRAWL_ORDER])) {
-            $this->crawl_order = $info[self::CRAWL_ORDER];
-        }
-        if(isset($info[self::CACHE_PAGES])) {
-            $this->cache_pages= $info[self::CACHE_PAGES];
+        $update_fields = array(self::CRAWL_TYPE => "crawl_type",
+            self::CRAWL_INDEX => "crawl_index", self::CRAWL_ORDER =>
+            'crawl_order', self::CACHE_PAGES => 'cache_pages',
+            self::INDEXED_FILE_TYPES => 'indexed_file_types',
+            self::RESTRICT_SITES_BY_URL => 'restrict_sites_by_url',
+            self::ALLOWED_SITES => 'allowed_sites',
+            self::DISALLOWED_SITES => 'disallowed_sites');
+        foreach($update_fields as $info_field => $field) {
+            if(isset($info[$info_field])) {
+                $this->$field = $info[$info_field];
+            }
         }
+
         if(isset($info[self::PAGE_RULES]) ){
             $rule_string = implode("\n", $info[self::PAGE_RULES]);
             $rule_string = html_entity_decode($rule_string, ENT_QUOTES);
@@ -1379,6 +1388,10 @@ class Fetcher implements CrawlConstants
                 crawlLog("  Using Processor...".$page_processor);
                 $doc_info = $processor->handle($site[self::PAGE],
                     $site[self::URL]);
+                if($page_processor != "RobotProcessor" &&
+                    !isset($doc_info[self::JUST_METAS])) {
+                    $this->pruneLinks($doc_info);
+                }
             } else if(!$handled) {
                 $doc_info = false;
             }
@@ -1515,6 +1528,52 @@ class Fetcher implements CrawlConstants
         return $summarized_site_pages;
     }

+    /**
+     * Drops links disallowed by crawl settings; keeps at most MAX_LINKS_PER_PAGE
+     */
+    function pruneLinks(&$doc_info)
+    {
+        if(!isset($doc_info[self::LINKS])) {
+            return;
+        }
+
+        $links = array();
+        foreach($doc_info[self::LINKS] as $url => $text) {
+            $doc_type = UrlParser::getDocumentType($url);
+            if(!in_array($doc_type, $this->all_file_types)) {
+                $doc_type = "unknown";
+            }
+            if(!in_array($doc_type, $this->indexed_file_types)) {
+                continue;
+            }
+            if($this->restrict_sites_by_url) {
+                if(!UrlParser::urlMemberSiteArray($url, $this->allowed_sites)) {
+                    continue;
+                }
+            }
+            if(UrlParser::urlMemberSiteArray($url, $this->disallowed_sites)) {
+                continue;
+            }
+            $links[$url] = $text;
+        }
+        if(count($links) <= MAX_LINKS_PER_PAGE) {
+            $doc_info[self::LINKS] = $links;
+            return;
+        }
+        $info_link = array();
+        // crudely keep the MAX_LINKS_PER_PAGE many links with the most info
+        foreach($links as $url => $text) {
+            $info_link[$url] = strlen(gzcompress($text));
+        }
+        arsort($info_link);
+        $link_urls = array_keys(array_slice($info_link, 0, MAX_LINKS_PER_PAGE));
+        $doc_info[self::LINKS] = array();
+        foreach($link_urls as $url) {
+            $doc_info[self::LINKS][$url] = $links[$url];
+        }
+    }
+
+
     /**
      * Copies fields from the array of site data to the $i indexed
      * element of the $summarized_site_pages and $stored_site_pages array
diff --git a/configs/config.php b/configs/config.php
index 297d8c2f6..150852d38 100644
--- a/configs/config.php
+++ b/configs/config.php
@@ -259,7 +259,7 @@ define('URL_FILTER_SIZE', 20000000);
  */
 define('NUM_URLS_QUEUE_RAM', 300000);

-/** Minimum weight in priority queue before rebuilt*/
+/** Minimum weight in priority queue before rebuilt */
 define('MIN_QUEUE_WEIGHT', 1/100000);

 /**  largest sized object allowedin a web archive (used to sanity check
@@ -273,11 +273,14 @@ define('NUM_DOCS_PER_GENERATION', 50000);
 /** precision to round floating points document scores */
 define('PRECISION', 10);

-/** maximum number of links to consider on any given page */
+/** maximum number of links to extract from a page on an initial pass */
+define('MAX_LINKS_TO_EXTRACT', 300);
+
+/** maximum number of links to keep after initial extraction */
 define('MAX_LINKS_PER_PAGE', 50);

 /** maximum number of links to consider from a sitemap page */
-define('MAX_LINKS_PER_SITEMAP', 200);
+define('MAX_LINKS_PER_SITEMAP', 300);

 /**  maximum number of words from links to consider on any given page */
 define('MAX_LINKS_WORD_TEXT', 100);
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index 2941994f1..50647d855 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -334,7 +334,7 @@ class HtmlProcessor extends TextProcessor
     }

     /**
-     * Returns up to MAX_LINK_PER_PAGE many links from the supplied
+     * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
      * dom object where links have been canonicalized according to
      * the supplied $site information.
      *
@@ -363,7 +363,7 @@ class HtmlProcessor extends TextProcessor


         foreach($hrefs as $href) {
-            if($i < MAX_LINKS_PER_PAGE) {
+            if($i < MAX_LINKS_TO_EXTRACT) {
                 $rel = $href->getAttribute("rel");
                 if($rel == "" || !stristr($rel, "nofollow")) {
                     $url = UrlParser::canonicalLink(
@@ -386,7 +386,7 @@ class HtmlProcessor extends TextProcessor

         $frames = $xpath->evaluate("/html/frameset/frame|/html/body//iframe");
         foreach($frames as $frame) {
-            if($i < MAX_LINKS_PER_PAGE) {
+            if($i < MAX_LINKS_TO_EXTRACT) {
                 $url = UrlParser::canonicalLink(
                     $frame->getAttribute('src'), $site);

@@ -408,7 +408,7 @@ class HtmlProcessor extends TextProcessor
         $i = 0;

         foreach($imgs as $img) {
-            if($i < MAX_LINKS_PER_PAGE) {
+            if($i < MAX_LINKS_TO_EXTRACT) {
                 $alt = $img->getAttribute('alt');

                 if(strlen($alt) < 1) { continue; }
diff --git a/lib/processors/pptx_processor.php b/lib/processors/pptx_processor.php
index 05da3047b..470a5387f 100644
--- a/lib/processors/pptx_processor.php
+++ b/lib/processors/pptx_processor.php
@@ -155,7 +155,7 @@ class PptxProcessor extends TextProcessor
         $i=0;

         foreach($paras as $para) {
-            if($i < MAX_LINKS_PER_PAGE) {
+            if($i < MAX_LINKS_TO_EXTRACT) {
                 $hlink = $para->parentNode->parentNode->
                     getElementsByTagName("t")->item(0)->nodeValue;

@@ -262,4 +262,4 @@ class PptxProcessor extends TextProcessor
     }

 }
-?>
\ No newline at end of file
+?>
diff --git a/lib/processors/rss_processor.php b/lib/processors/rss_processor.php
index 9a87a9554..9fbb539fa 100644
--- a/lib/processors/rss_processor.php
+++ b/lib/processors/rss_processor.php
@@ -252,7 +252,7 @@ class RssProcessor extends TextProcessor
                     $sites[$url] = $text;
                     $i++;
                 }
-                if($i >= MAX_LINKS_PER_PAGE) {
+                if($i >= MAX_LINKS_TO_EXTRACT) {
                     break 2;
                 }
             }
diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php
index 0e06f3080..e61d002c6 100755
--- a/lib/processors/text_processor.php
+++ b/lib/processors/text_processor.php
@@ -180,7 +180,7 @@ class TextProcessor extends PageProcessor
             if(!isset($sites[$url]) && strlen($url) < MAX_URL_LENGTH) {
                 $sites[$url] = strip_tags($url);
                 $i++;
-                if($i >= MAX_LINKS_PER_PAGE) {break;}
+                if($i >= MAX_LINKS_TO_EXTRACT) {break;}
             }
         }
         return $sites;
@@ -224,4 +224,4 @@ class TextProcessor extends PageProcessor
     }
 }

-?>
\ No newline at end of file
+?>
diff --git a/lib/processors/xlsx_processor.php b/lib/processors/xlsx_processor.php
index 1640412b9..fc8224027 100644
--- a/lib/processors/xlsx_processor.php
+++ b/lib/processors/xlsx_processor.php
@@ -241,7 +241,7 @@ class XlsxProcessor extends TextProcessor
                 if( strcmp( $relation->getAttribute('Type'),
                     $hyperlink) == 0 ) {

-                    if($i < MAX_LINKS_PER_PAGE) {
+                    if($i < MAX_LINKS_TO_EXTRACT) {
                         $link = $relation->getAttribute('Target');
                         $url = UrlParser::canonicalLink(
                             $link, $site);
@@ -264,4 +264,4 @@ class XlsxProcessor extends TextProcessor

 }

-?>
\ No newline at end of file
+?>
diff --git a/models/machine_model.php b/models/machine_model.php
index 5c9a02223..a5dd6d16a 100644
--- a/models/machine_model.php
+++ b/models/machine_model.php
@@ -293,7 +293,6 @@ class MachineModel extends Model
             } else {
                 $url .= "&queue_server=$value";
             }
-echo $url;
             echo FetchUrl::getPage($url);
         }
         $this->db->execute("COMMIT");
diff --git a/tests/url_parser_test.php b/tests/url_parser_test.php
index 77c3002f3..99ff2d96a 100644
--- a/tests/url_parser_test.php
+++ b/tests/url_parser_test.php
@@ -202,7 +202,8 @@ class UrlParserTest extends UnitTest
         $sites = array("http://www.example.com/",
             "http://www.cs.sjsu.edu/faculty/pollett/*/*/",
             "http://www.bing.com/video/search?*&*&",
-            "http://*.cool.*/a/*/", "domain:ucla.edu");
+            "http://*.cool.*/a/*/", "domain:ucla.edu",
+            "domain:foodnetwork.com");
        $test_urls = array(
             array("http://www.cs.sjsu.edu/faculty/pollett/", false,
                 "regex url negative 1"),
@@ -220,6 +221,8 @@ class UrlParserTest extends UnitTest
                 "regex url positive 3"),
             array("http://test.ucla.edu", true,
                 "domain test positive"),
+            array("http://www.foodnetworkstore.com/small-appliances/", false,
+                "domain test negative"),
         );
         foreach($test_urls as $test_url) {
             $result = UrlParser::urlMemberSiteArray($test_url[0], $sites);
ViewGit