diff --git a/src/configs/Config.php b/src/configs/Config.php
index 5abd4ad22..960ec2eef 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -744,12 +744,8 @@ nsconddefine('NUM_DOCS_PER_GENERATION', MEMORY_PROFILE * 10000);
 nsconddefine('PRECISION', 10);
 /** maximum number of links to extract from a page on an initial pass*/
 nsconddefine('MAX_LINKS_TO_EXTRACT', MEMORY_PROFILE * 80);
-/** maximum number of links to keep after initial extraction*/
-nsconddefine('MAX_LINKS_PER_PAGE', 50);
 /** Estimate of the average number of links per page a document has*/
 nsconddefine('AVG_LINKS_PER_PAGE', 24);
-/** maximum number of links to consider from a sitemap page */
-nsconddefine('MAX_LINKS_PER_SITEMAP', MEMORY_PROFILE * 80);
 /** minimum char length of link text before gets its own document */
 nsconddefine('MIN_LINKS_TEXT_CHARS', 3);
 /** maximum number of chars for link text to use for any given url on
diff --git a/src/configs/default_crawl.ini b/src/configs/default_crawl.ini
index 93ed77ca8..d7cc7462c 100644
--- a/src/configs/default_crawl.ini
+++ b/src/configs/default_crawl.ini
@@ -36,6 +36,7 @@ arc_type = "";
 page_recrawl_frequency = '-1';
 page_range_request = '100000';
 max_description_len = '10000';
+max_links_to_extract = '50';
 cache_pages = true;
 restrict_sites_by_url = false;
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index 040324009..a9dee6c2e 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -698,36 +698,29 @@ class CrawlComponent extends Component implements CrawlConstants
         $parent = $this->parent;
         $crawl_params[self::CRAWL_TYPE] = $seed_info['general']['crawl_type'];
         $crawl_params[self::CRAWL_INDEX] =
-            (isset($seed_info['general']['crawl_index'])) ?
-            $seed_info['general']['crawl_index'] : '';
+            $seed_info['general']['crawl_index'] ?? '';
         $crawl_params[self::CHANNEL] =
-            (isset($seed_info['general']['channel'])) ?
-            $seed_info['general']['channel'] : '';
+            $seed_info['general']['channel'] ?? '';
         $crawl_params[self::ARC_DIR]=
-            (isset($seed_info['general']['arc_dir'])) ?
-            $seed_info['general']['arc_dir'] : '';
+            $seed_info['general']['arc_dir'] ?? '';
         $crawl_params[self::ARC_TYPE] =
-            (isset($seed_info['general']['arc_type'])) ?
-            $seed_info['general']['arc_type'] : '';
+            $seed_info['general']['arc_type'] ?? '';
         $crawl_params[self::CACHE_PAGES] =
-            (isset($seed_info['general']['cache_pages'])) ?
-            intval($seed_info['general']['cache_pages']) :
-            true;
+            intval($seed_info['general']['cache_pages'] ?? true);
         $crawl_params[self::PAGE_RANGE_REQUEST] =
-            (isset($seed_info['general']['page_range_request'])) ?
-            intval($seed_info['general']['page_range_request']) :
-            C\PAGE_RANGE_REQUEST;
+            intval($seed_info['general']['page_range_request'] ??
+            C\PAGE_RANGE_REQUEST);
         $crawl_params[self::MAX_DESCRIPTION_LEN] =
-            (isset($seed_info['general']['max_description_len'])) ?
-            intval($seed_info['general']['max_description_len']) :
-            C\MAX_DESCRIPTION_LEN;
+            intval($seed_info['general']['max_description_len'] ??
+            C\MAX_DESCRIPTION_LEN);
+        $crawl_params[self::MAX_LINKS_TO_EXTRACT] =
+            intval($seed_info['general']['max_links_to_extract'] ??
+            C\MAX_LINKS_TO_EXTRACT);
         $crawl_params[self::PAGE_RECRAWL_FREQUENCY] =
-            (isset($seed_info['general']['page_recrawl_frequency'])) ?
-            intval($seed_info['general']['page_recrawl_frequency']) :
-            C\PAGE_RECRAWL_FREQUENCY;
+            intval($seed_info['general']['page_recrawl_frequency'] ??
+            C\PAGE_RECRAWL_FREQUENCY);
         $crawl_params[self::TO_CRAWL] = $seed_info['seed_sites']['url'];
-        $crawl_params[self::CRAWL_ORDER] =
-            $seed_info['general']['crawl_order'];
+        $crawl_params[self::CRAWL_ORDER] = $seed_info['general']['crawl_order'];
         $crawl_params[self::MAX_DEPTH] = $seed_info['general']['max_depth'];
         $crawl_params[self::REPEAT_TYPE] = $seed_info['general']['repeat_type'];
         $crawl_params[self::SLEEP_START] = $seed_info['general']['sleep_start'];
@@ -737,11 +730,9 @@ class CrawlComponent extends Component implements CrawlConstants
         $crawl_params[self::RESTRICT_SITES_BY_URL] =
             $seed_info['general']['restrict_sites_by_url'];
         $crawl_params[self::ALLOWED_SITES] =
-            isset($seed_info['allowed_sites']['url']) ?
-            $seed_info['allowed_sites']['url'] : [];
+            $seed_info['allowed_sites']['url'] ?? [];
         $crawl_params[self::DISALLOWED_SITES] =
-            isset($seed_info['disallowed_sites']['url']) ?
-            $seed_info['disallowed_sites']['url'] : [];
+            $seed_info['disallowed_sites']['url'] ?? [];
         if (isset($seed_info['indexed_file_types']['extensions'])) {
             $crawl_params[self::INDEXED_FILE_TYPES] =
                 $seed_info['indexed_file_types']['extensions'];
@@ -765,8 +756,7 @@ class CrawlComponent extends Component implements CrawlConstants
                 $seed_info['indexing_plugins']['plugins'];
         }
         $crawl_params[self::PAGE_RULES] =
-            isset($seed_info['page_rules']['rule']) ?
-            $seed_info['page_rules']['rule'] : [];
+            $seed_info['page_rules']['rule'] ?? [];
     }
     /**
      * Called from @see manageCrawls to edit the parameters for the next
@@ -1064,7 +1054,6 @@ class CrawlComponent extends Component implements CrawlConstants
             in_array($_REQUEST['robots_txt'],
             array_keys($data['robots_txt_behaviors']))) {
             $seed_info['general']['robots_txt'] = $_REQUEST['robots_txt'];
-            echo $seed_info['general']['robots_txt'];
             $update_flag = true;
         }
         $data['restrict_sites_by_url'] =
@@ -1541,8 +1530,7 @@ class CrawlComponent extends Component implements CrawlConstants
                 PageProcessor::$indexed_file_types;
         }
         $loaded = false;
-        if (isset($_REQUEST['load_option']) &&
-            $_REQUEST['load_option'] > 0) {
+        if (isset($_REQUEST['load_option']) && $_REQUEST['load_option'] > 0) {
             if ($_REQUEST['load_option'] == 1) {
                 $seed_loaded = $crawl_model->getSeedInfo(true);
             } else {
@@ -1552,8 +1540,8 @@ class CrawlComponent extends Component implements CrawlConstants
                     $timestamp, $machine_urls);
             }
             $copy_options = ["general" => ["page_recrawl_frequency",
-                "page_range_request", "max_description_len", "cache_pages",
-                'summarizer_option'],
+                "page_range_request", "max_description_len",
+                "max_links_to_extract", "cache_pages", 'summarizer_option'],
                 "indexed_file_types" => ["extensions"],
                 "indexing_plugins" => ["plugins", "plugins_data"]];
             foreach ($copy_options as $main_option => $sub_options) {
@@ -1604,7 +1592,7 @@ class CrawlComponent extends Component implements CrawlConstants
         }
         if (isset($_REQUEST["max_links_to_extract"]) &&
             in_array($_REQUEST["max_links_to_extract"],
-            $data['MAX_LINKS_VALUES'])) {
+            array_keys($data['MAX_LINKS_VALUES']))) {
             $seed_info["general"]["max_links_to_extract"] =
                 $_REQUEST["max_links_to_extract"];
         }
@@ -1644,7 +1632,7 @@ class CrawlComponent extends Component implements CrawlConstants
         }
         $data['MAX_LEN'] = $seed_info["general"]["max_description_len"];
         $data['MAX_LINKS_TO_EXTRACT'] =
-            $seed_info["general"]["MAX_LINKS_TO_EXTRACT"] ??
+            $seed_info["general"]["max_links_to_extract"] ??
             C\MAX_LINKS_TO_EXTRACT;
         $data['INDEXING_PLUGINS'] = [];
         $included_plugins = [];
@@ -1964,7 +1952,7 @@ class CrawlComponent extends Component implements CrawlConstants
             $processor_name = C\NS_PROCESSORS . $processor_name;
             $page_processor = new $processor_name($plugin_processors,
                 $seed_info["general"]["max_description_len"],
-                $seed_info["general"]["max_links_to_extract"],
+                -1,
                 $seed_info["general"]["summarizer_option"]);
             set_error_handler(null);
             if (L\generalIsA($processor_name, C\NS_PROCESSORS.
@@ -1993,7 +1981,8 @@ class CrawlComponent extends Component implements CrawlConstants
             if ($processor_name != C\NS_PROCESSORS . "RobotProcessor" &&
                 !isset($doc_info[self::JUST_METAS])) {
                 $doc_info[self::LINKS] = UrlParser::pruneLinks(
-                    $doc_info[self::LINKS]);
+                    $doc_info[self::LINKS],
+                    $seed_info["general"]["max_links_to_extract"]);
             }
             foreach ($doc_info as $key => $value) {
                 $site[$key] = $value;
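A note on the null-coalescing conversions above: ?? only falls back when its
left operand is null or an unset index, so the operator has to wrap the array
lookup itself. A form like intval($x) ?? $default can never take the default,
because intval() always returns an int. The snippet below is an illustrative
sketch with invented values, not code from the patch:

    <?php
    // Illustrative sketch, not part of the patch.
    $general = [];  // crawl profile with 'max_description_len' missing
    // Dead default: intval() never returns null, so this yields 0 and
    // raises an undefined-index warning when the key is absent.
    $len = intval($general['max_description_len']) ?? 2000;
    // Working form: coalesce the lookup, then cast.
    $len = intval($general['max_description_len'] ?? 2000);  // 2000
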
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 5e748b8b6..ada6af60c 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -556,12 +556,10 @@ class Fetcher implements CrawlConstants
         if (isset($this->plugin_processors[$page_processor])) {
             $processor_cache[$type] = new $page_processor(
                 $this->plugin_processors[$page_processor],
-                $this->max_description_len, $this->max_links_to_extract,
-                $this->summarizer_option);
+                $this->max_description_len, -1, $this->summarizer_option);
         } else {
             $processor_cache[$type] = new $page_processor([],
-                $this->max_description_len, $this->max_links_to_extract,
-                $this->summarizer_option);
+                $this->max_description_len, -1, $this->summarizer_option);
         }
         if (L\generalIsA($page_processor, $text_processor)) {
             $processor_cache[$type]->text_data = true;
@@ -1630,6 +1628,9 @@
         if (isset($info[self::MAX_DESCRIPTION_LEN])) {
             $this->max_description_len = $info[self::MAX_DESCRIPTION_LEN];
         }
+        if (isset($info[self::MAX_LINKS_TO_EXTRACT])) {
+            $this->max_links_to_extract = $info[self::MAX_LINKS_TO_EXTRACT];
+        }
     }
     /**
      * Prepare an array of up to NUM_MULTI_CURL_PAGES' worth of sites to be
@@ -2314,10 +2315,9 @@
             "d" . $this->allow_disallow_cache_time);
     }
     /**
-     * Page processors are allowed to extract up to MAX_LINKS_TO_EXTRACT
      * This method attempts to cull from the doc_info struct the
-     * best MAX_LINKS_PER_PAGE. Currently, this is done by first removing
-     * links of filetype or sites the crawler is forbidden from crawl.
+     * best $this->max_links_to_extract. Currently, this is done by first
+     * removing links of filetype or sites the crawler is forbidden from crawling.
      * Then a crude estimate of the information contained in the links test:
      * strlen(gzip(text)) is used to extract the best remaining links.
      *
@@ -2366,7 +2366,8 @@ class Fetcher implements CrawlConstants
             }
             $links[$url] = $url_info;
         }
-        $doc_info[$field] = UrlParser::pruneLinks($links);
+        $doc_info[$field] = UrlParser::pruneLinks($links,
+            $this->max_links_to_extract);
     }
     /**
      * Copies fields from the array of site data to the $i indexed
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 5f6a8391b..8c151aa4b 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -168,6 +168,11 @@ class QueueServer implements CrawlConstants, Join
      * @var int
      */
     public $max_description_len;
+    /**
+     * Maximum number of urls to extract from a single document
+     * @var int
+     */
+    public $max_links_to_extract;
     /**
      * Number of days between resets of the page url filter
      * If nonpositive, then never reset filter
@@ -324,6 +329,7 @@ class QueueServer implements CrawlConstants, Join
         "sleep_duration" => self::SLEEP_DURATION,
         "robots_txt" => self::ROBOTS_TXT,
         "max_description_len" => self::MAX_DESCRIPTION_LEN,
+        "max_links_to_extract" => self::MAX_LINKS_TO_EXTRACT,
         "page_recrawl_frequency" => self::PAGE_RECRAWL_FREQUENCY,
         "indexed_file_types" => self::INDEXED_FILE_TYPES,
         "restrict_sites_by_url" => self::RESTRICT_SITES_BY_URL,
@@ -369,6 +375,7 @@ class QueueServer implements CrawlConstants, Join
         $this->page_recrawl_frequency = C\PAGE_RECRAWL_FREQUENCY;
         $this->page_range_request = C\PAGE_RANGE_REQUEST;
         $this->max_description_len = C\MAX_DESCRIPTION_LEN;
+        $this->max_links_to_extract = C\MAX_LINKS_TO_EXTRACT;
         $this->server_type = self::BOTH;
         $this->indexing_plugins = [];
         $this->indexing_plugins_data = [];
@@ -806,7 +813,8 @@ class QueueServer implements CrawlConstants, Join
             $this->processEtagExpires();
         }
         $count = $this->web_queue->to_crawl_queue->count;
-        $max_links = max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP);
+        $max_links = max(C\MAX_LINKS_TO_EXTRACT,
+            $this->max_links_to_extract);
         if ($count < C\NUM_URLS_QUEUE_RAM -
             C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links) {
             $this->processQueueUrls();
@@ -1562,23 +1570,9 @@ class QueueServer implements CrawlConstants, Join
             L\crawlLog("...none.");
             return;
         }
-        $updatable_info = [
-            "repeat_type" =>self::REPEAT_TYPE,
-            "sleep_start" =>self::SLEEP_START,
-            "sleep_duration" =>self::SLEEP_DURATION,
-            "robots_txt" =>self::ROBOTS_TXT,
-            "page_range_request" => self::PAGE_RANGE_REQUEST,
-            "max_description_len" => self::MAX_DESCRIPTION_LEN,
-            "page_recrawl_frequency" => self::PAGE_RECRAWL_FREQUENCY,
-            "restrict_sites_by_url" => self::RESTRICT_SITES_BY_URL,
-            "cache_pages" => self::CACHE_PAGES,
-            "allowed_sites" => self::ALLOWED_SITES,
-            "disallowed_sites" => self::DISALLOWED_SITES,
-            "page_rules" => self::PAGE_RULES,
-            "indexed_file_types" => self::INDEXED_FILE_TYPES,
-            "indexing_plugins" => self::INDEXING_PLUGINS,
-            "indexing_plugins_data" => self::INDEXING_PLUGINS_DATA,
-        ];
+        $updatable_info = self::$info_parameter_map;
+        unset($updatable_info["crawl_order"], $updatable_info["crawl_type"],
+            $updatable_info["crawl_index"], $updatable_info["crawl_pages"]);
         $keys = array_keys($updatable_info);
         $archive_info = $index_archive_class::getArchiveInfo($dir);
         $index_info = unserialize($archive_info['DESCRIPTION']);
@@ -1852,7 +1846,7 @@ class QueueServer implements CrawlConstants, Join
         $num = 0;
         $bad = false;
         $max_batch_sites_and_links = C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER *
-            (max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP) + 1);
+            (max(C\MAX_LINKS_TO_EXTRACT, $this->max_links_to_extract) + 1);
         while($pos < $len_urls && $num <= $max_batch_sites_and_links) {
             L\crawlTimeoutLog("..Indexer still processing index data at " .
                 "position %s of out of %s", $pos, $len_urls);
@@ -2434,6 +2428,7 @@ class QueueServer implements CrawlConstants, Join
         $sites[self::INDEXING_PLUGINS_DATA] = $this->indexing_plugins_data;
         $sites[self::PAGE_RANGE_REQUEST] = $this->page_range_request;
         $sites[self::MAX_DESCRIPTION_LEN] = $this->max_description_len;
+        $sites[self::MAX_LINKS_TO_EXTRACT] = $this->max_links_to_extract;
         $sites[self::POST_MAX_SIZE] = L\metricToInt(ini_get("post_max_size"));
         $sites[self::SITES] = [];
         return base64_encode(serialize($sites)) . "\n";
@@ -2479,7 +2474,7 @@ class QueueServer implements CrawlConstants, Join
            number of slots
         */
         $num_waiting_urls = 0;
-        $max_links = max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP);
+        $max_links = max(C\MAX_LINKS_TO_EXTRACT, $this->max_links_to_extract);
         $max_queue_size = C\NUM_URLS_QUEUE_RAM -
             C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links;
         $examined_count = 0;
@@ -2813,7 +2808,8 @@ class QueueServer implements CrawlConstants, Join
             "Time failing to make a fetch batch:" .
             L\changeInMicrotime($start_time)." . Loop properties:$i $count".
             " $num_deletes urls were deleted in failed attempt.");
-        $max_links = max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP);
+        $max_links = max(C\MAX_LINKS_TO_EXTRACT,
+            $this->max_links_to_extract);
         if ($num_deletes < 5 && $i >= $count &&
             $count >= C\NUM_URLS_QUEUE_RAM -
             C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links) {
diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php
index f28f96c94..732b9014a 100755
--- a/src/library/CrawlConstants.php
+++ b/src/library/CrawlConstants.php
@@ -229,7 +229,7 @@ interface CrawlConstants
     const CENTROID_SUMMARIZER = 'dl';
     const SUMMARIZER_OPTION = 'dm';
     const WORD_CLOUD = 'dn';
-    const THESAURUS_SCORE ='do';
+    const MAX_LINKS_TO_EXTRACT = 'do';
     const IS_GOPHER_URL = "dp";
     const MINIMUM_FETCH_LOOP_TIME = "dq";
     const IMAGE_LINK = "dr";
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 82a39289b..dff09f343 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -934,17 +934,18 @@ class UrlParser
      * @return array $out_links extracted from $links according to the
      *      description above.
      */
-    public static function pruneLinks($links, $max_links = C\MAX_LINKS_PER_PAGE)
+    public static function pruneLinks($links, $max_links =
+        C\MAX_LINKS_TO_EXTRACT)
     {
         $consonants = "bcdfghjklmnpqrstvyz";
         $vowels = "aeiouy";
         $digit_consonants = "([$consonants]|\d)";
         $digit_vowels = "([$vowels]|\d)";
-        if (count($links) <= C\MAX_LINKS_PER_PAGE) {
+        if ($max_links <= 0 || count($links) <= $max_links) {
             return $links;
         }
         $info_link = [];
-        // choose the MAX_LINKS_PER_PAGE many pages with most info (crude)
+        // choose the $max_links many pages with most info (crude)
         foreach ($links as $url => $text) {
             $text = (is_string($text)) ?
                 $text : "";
             $terms = preg_split("/\s+|\-|\_|\~/", $text);
@@ -961,7 +962,7 @@
         }
         arsort($info_link);
         $link_urls = array_keys(array_slice($info_link, 0,
-            C\MAX_LINKS_PER_PAGE));
+            $max_links));
         $out_links = [];
         foreach ($link_urls as $url) {
             $out_links[$url] = $links[$url];
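To make the revised pruneLinks() contract concrete, here is a hypothetical
call (invented URLs and limit; only the function itself is from the patch).
A positive $max_links keeps the links whose text carries the most information
by the crude estimate above; a non-positive value now means no pruning at all:

    <?php
    // Hypothetical usage of the revised UrlParser::pruneLinks();
    // assumes the Yioop autoloader is already set up.
    use seekquarry\yioop\library\UrlParser;

    $links = [
        "https://example.com/a" => "minutes of the march board meeting",
        "https://example.com/b" => "b",
        "https://example.com/c" => "annual financial report for 2024",
    ];
    // Keep only the two most informative links.
    $best = UrlParser::pruneLinks($links, 2);
    // A non-positive limit short-circuits: every link comes back unpruned.
    $all = UrlParser::pruneLinks($links, -1);
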
diff --git a/src/library/media_jobs/AnalyticsJob.php b/src/library/media_jobs/AnalyticsJob.php
index a9eea2b1a..cdc305693 100644
--- a/src/library/media_jobs/AnalyticsJob.php
+++ b/src/library/media_jobs/AnalyticsJob.php
@@ -35,6 +35,7 @@ use seekquarry\yioop\library as L;
 use seekquarry\yioop\library\CrawlConstants;
 use seekquarry\yioop\library\UrlParser;
 use seekquarry\yioop\library\processors\PageProcessor;
+use seekquarry\yioop\models\CrawlModel;
 use seekquarry\yioop\models\ImpressionModel;
 use seekquarry\yioop\models\MachineModel;
 use seekquarry\yioop\models\PhraseModel;
@@ -70,6 +71,12 @@ class AnalyticsJob extends MediaJob
      * @var object
      */
     public $machine_model;
+    /**
+     * Used to get crawl seed info
+     *
+     * @var object
+     */
+    public $crawl_model;
     /**
      * For size and time distributions the number of times the minimal
      * recorded interval (DOWNLOAD_SIZE_INTERVAL for size) to check for
@@ -92,6 +99,7 @@ class AnalyticsJob extends MediaJob
         $this->impression_model = new ImpressionModel();
         $this->phrase_model = new PhraseModel();
         $this->machine_model = new MachineModel();
+        $this->crawl_model = new CrawlModel();
         PageProcessor::initializeIndexedFileTypes();
     }
     /**
@@ -165,6 +173,8 @@ class AnalyticsJob extends MediaJob
         L\crawlLog("Starting to compute statistics for timestamp index ".
             $data["TIMESTAMP"]);
         $machine_urls = $this->machine_model->getQueueServerUrls();
+        $seed_info = $this->crawl_model->getCrawlSeedInfo($data["TIMESTAMP"],
+            $machine_urls);
         $num_machines = count($machine_urls);
         if ($num_machines < 1 || ($num_machines == 1 &&
             UrlParser::isLocalhostUrl($machine_urls[0]))) {
@@ -249,7 +259,9 @@
         for ($i = 0; $i <= self::NUM_TIMES_INTERVAL; $i++) {
             $queries["DNS"][] = $i * C\DOWNLOAD_TIME_INTERVAL;
         }
-        for ($i = 0; $i <= C\MAX_LINKS_PER_SITEMAP; $i++) {
+        for ($i = 0; $i <= max(
+            $seed_info["general"]['max_links_to_extract'] ?? 0,
+            C\MAX_LINKS_TO_EXTRACT); $i++) {
             $queries["NUMLINKS"][] = $i;
         }
         $date = date("Y");
diff --git a/src/library/processors/DocxProcessor.php b/src/library/processors/DocxProcessor.php
index e58393112..b23d5b898 100644
--- a/src/library/processors/DocxProcessor.php
+++ b/src/library/processors/DocxProcessor.php
@@ -134,7 +134,8 @@ class DocxProcessor extends TextProcessor
         foreach ($relations as $relation) {
             if (strcmp(
                 $relation->getAttribute('Type'), $hyperlink) == 0 ) {
-                if ($i < self::$max_links_to_extract) {
+                if (self::$max_links_to_extract < 0 ||
+                    $i < self::$max_links_to_extract) {
                     $link = $relation->getAttribute('Target');
                     $url = UrlParser::canonicalLink(
                         $link, $site);
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index 1d17c3f1a..5d9ae8566 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -442,7 +442,8 @@ class HtmlProcessor extends TextProcessor
         $i = 0;
         $hrefs = $xpath->evaluate("/html/body//a");
         foreach ($hrefs as $href) {
-            if ($i < self::$max_links_to_extract) {
+            if (self::$max_links_to_extract < 0 ||
+                $i < self::$max_links_to_extract) {
                 $rel = $href->getAttribute("rel");
                 if ($rel == "" || !stristr($rel, "nofollow")) {
                     $url = UrlParser::canonicalLink(
@@ -502,7 +503,8 @@ class HtmlProcessor extends TextProcessor
         }
         $frames = $xpath->evaluate("/html/frameset/frame|/html/body//iframe");
         foreach ($frames as $frame) {
-            if ($i < self::$max_links_to_extract) {
+            if (self::$max_links_to_extract < 0 ||
+                $i < self::$max_links_to_extract) {
                 $url = UrlParser::canonicalLink(
                     $frame->getAttribute('src'), $site);
                 $len = strlen($url);
@@ -520,7 +522,8 @@ class HtmlProcessor extends TextProcessor
         $imgs = $xpath->evaluate("/html/body//img[@alt]");
         $i = 0;
         foreach ($imgs as $img) {
-            if ($i < self::$max_links_to_extract) {
+            if (self::$max_links_to_extract < 0 ||
+                $i < self::$max_links_to_extract) {
                 $alt = $img->getAttribute('alt');
                 if (strlen($alt) < 1) {
                     continue;
diff --git a/src/library/processors/PptxProcessor.php b/src/library/processors/PptxProcessor.php
index 24f7b516c..d944ffddb 100644
--- a/src/library/processors/PptxProcessor.php
+++ b/src/library/processors/PptxProcessor.php
@@ -146,9 +146,10 @@ class PptxProcessor extends TextProcessor
         $xpath = new \DOMXPath($dom);
         $paras = $xpath->evaluate("/p:sld//p:cSld//p:spTree//p:sp//
             p:txBody//a:p//a:r//a:rPr//a:hlinkClick");
-        $i=0;
+        $i = 0;
         foreach ($paras as $para) {
-            if ($i < C\MAX_LINKS_TO_EXTRACT) {
+            if (self::$max_links_to_extract < 0 ||
+                $i < self::$max_links_to_extract) {
                 if (empty($para->parentNode->parentNode->
                     getElementsByTagName("t")->item(0)->nodeValue)) {
                     continue;
diff --git a/src/library/processors/RssProcessor.php b/src/library/processors/RssProcessor.php
index 1a5485546..63d97cd9a 100644
--- a/src/library/processors/RssProcessor.php
+++ b/src/library/processors/RssProcessor.php
@@ -271,7 +271,8 @@ class RssProcessor extends TextProcessor
                 $sites[$url] = $info;
                 $i++;
             }
-            if ($i >= C\MAX_LINKS_TO_EXTRACT) {
+            if (self::$max_links_to_extract > 0 &&
+                $i >= self::$max_links_to_extract) {
                 break 2;
             }
         }
diff --git a/src/library/processors/SitemapProcessor.php b/src/library/processors/SitemapProcessor.php
index feb390477..2b629b436 100644
--- a/src/library/processors/SitemapProcessor.php
+++ b/src/library/processors/SitemapProcessor.php
@@ -119,7 +119,8 @@ class SitemapProcessor extends TextProcessor
                 $sites[$url] = "From sitemap : " . $site . " .." .
                     UrlParser::extractTextFromUrl($url);
                 $i++;
-                if ($i > self::$max_links_to_extract) {
+                if (self::$max_links_to_extract > 0 &&
+                    $i >= self::$max_links_to_extract) {
                     break 2;
                 }
             }
diff --git a/src/library/processors/TextProcessor.php b/src/library/processors/TextProcessor.php
index 783888f02..f6752516f 100755
--- a/src/library/processors/TextProcessor.php
+++ b/src/library/processors/TextProcessor.php
@@ -186,7 +186,8 @@ class TextProcessor extends PageProcessor
                 strlen($url) > 4) {
                 $sites[$url] = UrlParser::extractTextFromUrl($url);
                 $i++;
-                if ($i >= self::$max_links_to_extract) {
+                if (self::$max_links_to_extract > 0 &&
+                    $i >= self::$max_links_to_extract) {
                     break;
                 }
             }
diff --git a/src/library/processors/XlsxProcessor.php b/src/library/processors/XlsxProcessor.php
index 08655c332..386af34f4 100644
--- a/src/library/processors/XlsxProcessor.php
+++ b/src/library/processors/XlsxProcessor.php
@@ -226,7 +226,8 @@ class XlsxProcessor extends TextProcessor
         foreach ($relations as $relation) {
             if ( strcmp(
                 $relation->getAttribute('Type'), $hyperlink) == 0 ) {
-                if ($i < self::$max_links_to_extract) {
+                if (self::$max_links_to_extract < 0 ||
+                    $i < self::$max_links_to_extract) {
                     $link = $relation->getAttribute('Target');
                     $url = UrlParser::canonicalLink(
                         $link, $site);
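The processor hunks above all adopt the same sentinel convention: page
processors are now constructed with -1, each extraction loop treats a
non-positive self::$max_links_to_extract as unlimited, and the real per-crawl
cap is applied afterwards by UrlParser::pruneLinks(). The stand-alone sketch
below (stand-in variables, not patch code) shows the two guard styles side by
side:

    <?php
    // Sketch of the two guard styles; $max_links_to_extract here is a
    // stand-in for the static processor member, with -1 meaning "no cap".
    $max_links_to_extract = -1;
    $candidate_urls = ["https://example.com/a", "https://example.com/b"];
    $sites = [];
    $i = 0;
    foreach ($candidate_urls as $url) {
        // Proceed-style guard, as in the Docx/Html/Pptx/Xlsx processors:
        if ($max_links_to_extract < 0 || $i < $max_links_to_extract) {
            $sites[$url] = "link text for $url";
            $i++;
        }
        // Break-style guard, as in the Rss/Sitemap/Text processors:
        if ($max_links_to_extract > 0 && $i >= $max_links_to_extract) {
            break;
        }
    }
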
"true" : "false"; $n[] = "cache_pages = $bool_string;"; @@ -672,6 +660,8 @@ EOT; C\PAGE_RANGE_REQUEST], "max_description_len" => [self::MAX_DESCRIPTION_LEN, C\MAX_DESCRIPTION_LEN], + "max_links_to_extract" => [self::MAX_LINKS_TO_EXTRACT, + C\MAX_LINKS_TO_EXTRACT], ]; foreach ($general_params as $param => $info) { $seed_info['general'][$param] = (isset($index_info[$info[0]])) ?