diff --git a/bin/fetcher.php b/bin/fetcher.php index a3a1517c8..976241349 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -407,7 +407,7 @@ class Fetcher implements CrawlConstants $this->indexed_file_types = $indexed_file_types; $this->all_file_types = $indexed_file_types; - $this->restrict_sites_by_url = true; + $this->restrict_sites_by_url = false; $this->allowed_sites = array(); $this->disallowed_sites = array(); @@ -806,6 +806,7 @@ class Fetcher implements CrawlConstants "&crawl_time=$crawl_time"; $info_string = FetchUrl::getPage($request); $info = @unserialize(trim($info_string)); + if(isset($info[self::SAVED_CRAWL_TIMES])) { if(array_diff($info[self::SAVED_CRAWL_TIMES], $saved_crawl_times) != array() || @@ -865,6 +866,7 @@ class Fetcher implements CrawlConstants $this->arc_dir = ''; $this->arc_type = ''; } + $this->setCrawlParamsFromArray($info); // Load any batch that might exist for changed-to crawl if(file_exists("$dir/$prefix".self::fetch_crawl_info. "{$this->crawl_time}.txt") && file_exists( @@ -1046,11 +1048,17 @@ class Fetcher implements CrawlConstants return $info; } + /** + * Checks if memory_get_usage() exceeds 0.7 times the memory_limit ini value + */ function exceedMemoryThreshold() { return memory_get_usage() > (metricToInt(ini_get("memory_limit")) * 0.7); } + /** + * + */ function selectCurrentServerAndUpdateIfNeeded($at_least_once) { $i = 0; @@ -1098,18 +1106,19 @@ class Fetcher implements CrawlConstants } else { $info[self::CURRENT_SERVER] = $this->current_server; } - if(isset($info[self::CRAWL_TYPE])) { - $this->crawl_type = $info[self::CRAWL_TYPE]; - } - if(isset($info[self::CRAWL_INDEX])) { - $this->crawl_index = $info[self::CRAWL_INDEX]; - } - if(isset($info[self::CRAWL_ORDER])) { - $this->crawl_order = $info[self::CRAWL_ORDER]; - } - if(isset($info[self::CACHE_PAGES])) { - $this->cache_pages= $info[self::CACHE_PAGES]; + $update_fields = array(self::CRAWL_TYPE => "crawl_type", + self::CRAWL_INDEX => "crawl_index", self::CRAWL_ORDER => + 'crawl_order', self::CACHE_PAGES => 'cache_pages', + 
self::INDEXED_FILE_TYPES => 'indexed_file_types', + self::RESTRICT_SITES_BY_URL => 'restrict_sites_by_url', + self::ALLOWED_SITES => 'allowed_sites', + self::DISALLOWED_SITES => 'disallowed_sites'); + foreach($update_fields as $info_field => $field) { + if(isset($info[$info_field])) { + $this->$field = $info[$info_field]; + } } + if(isset($info[self::PAGE_RULES]) ){ $rule_string = implode("\n", $info[self::PAGE_RULES]); $rule_string = html_entity_decode($rule_string, ENT_QUOTES); @@ -1379,6 +1388,10 @@ class Fetcher implements CrawlConstants crawlLog(" Using Processor...".$page_processor); $doc_info = $processor->handle($site[self::PAGE], $site[self::URL]); + if($page_processor != "RobotProcessor" && + !isset($doc_info[self::JUST_METAS])) { + $this->pruneLinks($doc_info); + } } else if(!$handled) { $doc_info = false; } @@ -1515,6 +1528,52 @@ class Fetcher implements CrawlConstants return $summarized_site_pages; } + /** + * Drops links of non-indexed types or excluded sites; caps at MAX_LINKS_PER_PAGE + */ + function pruneLinks(&$doc_info) + { + if(!isset($doc_info[self::LINKS])) { + return; + } + + $links = array(); + foreach($doc_info[self::LINKS] as $url => $text) { + $doc_type = UrlParser::getDocumentType($url); + if(!in_array($doc_type, $this->all_file_types)) { + $doc_type = "unknown"; + } + if(!in_array($doc_type, $this->indexed_file_types)) { + continue; + } + if($this->restrict_sites_by_url) { + if(!UrlParser::urlMemberSiteArray($url, $this->allowed_sites)) { + continue; + } + } + if(UrlParser::urlMemberSiteArray($url, $this->disallowed_sites)) { + continue; + } + $links[$url] = $text; + } + if(count($links) <= MAX_LINKS_PER_PAGE) { + $doc_info[self::LINKS] = $links; + return; + } + $info_link = array(); + // choose the MAX_LINKS_PER_PAGE many pages with most info (crude) + foreach($links as $url => $text) { + $info_link[$url] = strlen(gzcompress($text)); + } + arsort($info_link); + $link_urls = array_keys(array_slice($info_link, 0, MAX_LINKS_PER_PAGE)); + $doc_info[self::LINKS] = array(); + foreach($link_urls as $url) { + 
$doc_info[self::LINKS][$url] = $links[$url]; + } + } + + /** * Copies fields from the array of site data to the $i indexed * element of the $summarized_site_pages and $stored_site_pages array diff --git a/configs/config.php b/configs/config.php index 297d8c2f6..150852d38 100644 --- a/configs/config.php +++ b/configs/config.php @@ -259,7 +259,7 @@ define('URL_FILTER_SIZE', 20000000); */ define('NUM_URLS_QUEUE_RAM', 300000); -/** Minimum weight in priority queue before rebuilt*/ +/** Minimum weight in priority queue before rebuilt */ define('MIN_QUEUE_WEIGHT', 1/100000); /** largest sized object allowedin a web archive (used to sanity check @@ -273,11 +273,14 @@ define('NUM_DOCS_PER_GENERATION', 50000); /** precision to round floating points document scores */ define('PRECISION', 10); -/** maximum number of links to consider on any given page */ +/** maximum number of links to extract from a page on an initial pass*/ +define('MAX_LINKS_TO_EXTRACT', 300); + +/** maximum number of links to keep after initial extraction*/ define('MAX_LINKS_PER_PAGE', 50); /** maximum number of links to consider from a sitemap page */ -define('MAX_LINKS_PER_SITEMAP', 200); +define('MAX_LINKS_PER_SITEMAP', 300); /** maximum number of words from links to consider on any given page */ define('MAX_LINKS_WORD_TEXT', 100); diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index 2941994f1..50647d855 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -334,7 +334,7 @@ class HtmlProcessor extends TextProcessor } /** - * Returns up to MAX_LINK_PER_PAGE many links from the supplied + * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied * dom object where links have been canonicalized according to * the supplied $site information. 
* @@ -363,7 +363,7 @@ class HtmlProcessor extends TextProcessor foreach($hrefs as $href) { - if($i < MAX_LINKS_PER_PAGE) { + if($i < MAX_LINKS_TO_EXTRACT) { $rel = $href->getAttribute("rel"); if($rel == "" || !stristr($rel, "nofollow")) { $url = UrlParser::canonicalLink( @@ -386,7 +386,7 @@ class HtmlProcessor extends TextProcessor $frames = $xpath->evaluate("/html/frameset/frame|/html/body//iframe"); foreach($frames as $frame) { - if($i < MAX_LINKS_PER_PAGE) { + if($i < MAX_LINKS_TO_EXTRACT) { $url = UrlParser::canonicalLink( $frame->getAttribute('src'), $site); @@ -408,7 +408,7 @@ class HtmlProcessor extends TextProcessor $i = 0; foreach($imgs as $img) { - if($i < MAX_LINKS_PER_PAGE) { + if($i < MAX_LINKS_TO_EXTRACT) { $alt = $img->getAttribute('alt'); if(strlen($alt) < 1) { continue; } diff --git a/lib/processors/pptx_processor.php b/lib/processors/pptx_processor.php index 05da3047b..470a5387f 100644 --- a/lib/processors/pptx_processor.php +++ b/lib/processors/pptx_processor.php @@ -155,7 +155,7 @@ class PptxProcessor extends TextProcessor $i=0; foreach($paras as $para) { - if($i < MAX_LINKS_PER_PAGE) { + if($i < MAX_LINKS_TO_EXTRACT) { $hlink = $para->parentNode->parentNode-> getElementsByTagName("t")->item(0)->nodeValue; @@ -262,4 +262,4 @@ class PptxProcessor extends TextProcessor } } -?> \ No newline at end of file +?> diff --git a/lib/processors/rss_processor.php b/lib/processors/rss_processor.php index 9a87a9554..9fbb539fa 100644 --- a/lib/processors/rss_processor.php +++ b/lib/processors/rss_processor.php @@ -252,7 +252,7 @@ class RssProcessor extends TextProcessor $sites[$url] = $text; $i++; } - if($i >= MAX_LINKS_PER_PAGE) { + if($i >= MAX_LINKS_TO_EXTRACT) { break 2; } } diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php index 0e06f3080..e61d002c6 100755 --- a/lib/processors/text_processor.php +++ b/lib/processors/text_processor.php @@ -180,7 +180,7 @@ class TextProcessor extends PageProcessor if(!isset($sites[$url]) && 
strlen($url) < MAX_URL_LENGTH) { $sites[$url] = strip_tags($url); $i++; - if($i >= MAX_LINKS_PER_PAGE) {break;} + if($i >= MAX_LINKS_TO_EXTRACT) {break;} } } return $sites; @@ -224,4 +224,4 @@ class TextProcessor extends PageProcessor } } -?> \ No newline at end of file +?> diff --git a/lib/processors/xlsx_processor.php b/lib/processors/xlsx_processor.php index 1640412b9..fc8224027 100644 --- a/lib/processors/xlsx_processor.php +++ b/lib/processors/xlsx_processor.php @@ -241,7 +241,7 @@ class XlsxProcessor extends TextProcessor if( strcmp( $relation->getAttribute('Type'), $hyperlink) == 0 ) { - if($i < MAX_LINKS_PER_PAGE) { + if($i < MAX_LINKS_TO_EXTRACT) { $link = $relation->getAttribute('Target'); $url = UrlParser::canonicalLink( $link, $site); @@ -264,4 +264,4 @@ class XlsxProcessor extends TextProcessor } -?> \ No newline at end of file +?> diff --git a/models/machine_model.php b/models/machine_model.php index 5c9a02223..a5dd6d16a 100644 --- a/models/machine_model.php +++ b/models/machine_model.php @@ -293,7 +293,6 @@ class MachineModel extends Model } else { $url .= "&queue_server=$value"; } -echo $url; echo FetchUrl::getPage($url); } $this->db->execute("COMMIT"); diff --git a/tests/url_parser_test.php b/tests/url_parser_test.php index 77c3002f3..99ff2d96a 100644 --- a/tests/url_parser_test.php +++ b/tests/url_parser_test.php @@ -202,7 +202,8 @@ class UrlParserTest extends UnitTest $sites = array("http://www.example.com/", "http://www.cs.sjsu.edu/faculty/pollett/*/*/", "http://www.bing.com/video/search?*&*&", - "http://*.cool.*/a/*/", "domain:ucla.edu"); + "http://*.cool.*/a/*/", "domain:ucla.edu", + "domain:foodnetwork.com"); $test_urls = array( array("http://www.cs.sjsu.edu/faculty/pollett/", false, "regex url negative 1"), @@ -220,6 +221,8 @@ class UrlParserTest extends UnitTest "regex url positive 3"), array("http://test.ucla.edu", true, "domain test positive"), + array("http://www.foodnetworkstore.com/small-appliances/", false, + "domain test negative"), 
); foreach($test_urls as $test_url) { $result = UrlParser::urlMemberSiteArray($test_url[0], $sites);