diff --git a/src/configs/Config.php b/src/configs/Config.php
index 5abd4ad22..960ec2eef 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -744,12 +744,8 @@ nsconddefine('NUM_DOCS_PER_GENERATION', MEMORY_PROFILE * 10000);
 nsconddefine('PRECISION', 10);
 /** maximum number of links to extract from a page on an initial pass*/
 nsconddefine('MAX_LINKS_TO_EXTRACT', MEMORY_PROFILE * 80);
-/** maximum number of links to keep after initial extraction*/
-nsconddefine('MAX_LINKS_PER_PAGE', 50);
 /** Estimate of the average number of links per page a document has*/
 nsconddefine('AVG_LINKS_PER_PAGE', 24);
-/** maximum number of links to consider from a sitemap page */
-nsconddefine('MAX_LINKS_PER_SITEMAP', MEMORY_PROFILE * 80);
 /** minimum char length of link text before gets its own document */
 nsconddefine('MIN_LINKS_TEXT_CHARS', 3);
 /** maximum number of chars for link text to use for any given url on
diff --git a/src/configs/default_crawl.ini b/src/configs/default_crawl.ini
index 93ed77ca8..d7cc7462c 100644
--- a/src/configs/default_crawl.ini
+++ b/src/configs/default_crawl.ini
@@ -36,6 +36,7 @@ arc_type = "";
 page_recrawl_frequency = '-1';
 page_range_request = '100000';
 max_description_len = '10000';
+max_links_to_extract = '50';
 cache_pages = true;
 restrict_sites_by_url = false;
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index 040324009..a9dee6c2e 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -698,36 +698,29 @@ class CrawlComponent extends Component implements CrawlConstants
         $parent = $this->parent;
         $crawl_params[self::CRAWL_TYPE] = $seed_info['general']['crawl_type'];
         $crawl_params[self::CRAWL_INDEX] =
-            (isset($seed_info['general']['crawl_index'])) ?
-            $seed_info['general']['crawl_index'] : '';
+            $seed_info['general']['crawl_index'] ?? '';
         $crawl_params[self::CHANNEL] =
-            (isset($seed_info['general']['channel'])) ?
-            $seed_info['general']['channel'] : '';
+            $seed_info['general']['channel'] ?? '';
         $crawl_params[self::ARC_DIR]=
-            (isset($seed_info['general']['arc_dir'])) ?
-            $seed_info['general']['arc_dir'] : '';
+            $seed_info['general']['arc_dir'] ?? '';
         $crawl_params[self::ARC_TYPE] =
-            (isset($seed_info['general']['arc_type'])) ?
-            $seed_info['general']['arc_type'] : '';
+            $seed_info['general']['arc_type'] ?? '';
         $crawl_params[self::CACHE_PAGES] =
-            (isset($seed_info['general']['cache_pages'])) ?
-            intval($seed_info['general']['cache_pages']) :
-            true;
+            intval($seed_info['general']['cache_pages'] ?? true);
         $crawl_params[self::PAGE_RANGE_REQUEST] =
-            (isset($seed_info['general']['page_range_request'])) ?
-            intval($seed_info['general']['page_range_request']) :
-            C\PAGE_RANGE_REQUEST;
+            intval($seed_info['general']['page_range_request'] ??
+            C\PAGE_RANGE_REQUEST);
         $crawl_params[self::MAX_DESCRIPTION_LEN] =
-            (isset($seed_info['general']['max_description_len'])) ?
-            intval($seed_info['general']['max_description_len']) :
-            C\MAX_DESCRIPTION_LEN;
+            intval($seed_info['general']['max_description_len'] ??
+            C\MAX_DESCRIPTION_LEN);
+        $crawl_params[self::MAX_LINKS_TO_EXTRACT] =
+            intval($seed_info['general']['max_links_to_extract'] ??
+            C\MAX_LINKS_TO_EXTRACT);
         $crawl_params[self::PAGE_RECRAWL_FREQUENCY] =
-            (isset($seed_info['general']['page_recrawl_frequency'])) ?
-            intval($seed_info['general']['page_recrawl_frequency']) :
-            C\PAGE_RECRAWL_FREQUENCY;
+            intval($seed_info['general']['page_recrawl_frequency'] ??
+            C\PAGE_RECRAWL_FREQUENCY);
         $crawl_params[self::TO_CRAWL] = $seed_info['seed_sites']['url'];
-        $crawl_params[self::CRAWL_ORDER] =
-            $seed_info['general']['crawl_order'];
+        $crawl_params[self::CRAWL_ORDER] = $seed_info['general']['crawl_order'];
         $crawl_params[self::MAX_DEPTH] = $seed_info['general']['max_depth'];
         $crawl_params[self::REPEAT_TYPE] = $seed_info['general']['repeat_type'];
         $crawl_params[self::SLEEP_START] = $seed_info['general']['sleep_start'];
@@ -737,11 +730,9 @@ class CrawlComponent extends Component implements CrawlConstants
         $crawl_params[self::RESTRICT_SITES_BY_URL] =
             $seed_info['general']['restrict_sites_by_url'];
         $crawl_params[self::ALLOWED_SITES] =
-            isset($seed_info['allowed_sites']['url']) ?
-            $seed_info['allowed_sites']['url'] : [];
+            $seed_info['allowed_sites']['url'] ?? [];
         $crawl_params[self::DISALLOWED_SITES] =
-            isset($seed_info['disallowed_sites']['url']) ?
-            $seed_info['disallowed_sites']['url'] : [];
+            $seed_info['disallowed_sites']['url'] ?? [];
         if (isset($seed_info['indexed_file_types']['extensions'])) {
             $crawl_params[self::INDEXED_FILE_TYPES] =
                 $seed_info['indexed_file_types']['extensions'];
@@ -765,8 +756,7 @@ class CrawlComponent extends Component implements CrawlConstants
                 $seed_info['indexing_plugins']['plugins'];
         }
         $crawl_params[self::PAGE_RULES] =
-            isset($seed_info['page_rules']['rule']) ?
-            $seed_info['page_rules']['rule'] : [];
+            $seed_info['page_rules']['rule'] ?? [];
     }
     /**
      * Called from @see manageCrawls to edit the parameters for the next
@@ -1064,7 +1054,6 @@ class CrawlComponent extends Component implements CrawlConstants
             in_array($_REQUEST['robots_txt'],
             array_keys($data['robots_txt_behaviors']))) {
             $seed_info['general']['robots_txt'] = $_REQUEST['robots_txt'];
-            echo $seed_info['general']['robots_txt'];
             $update_flag = true;
         }
         $data['restrict_sites_by_url'] =
@@ -1541,8 +1530,7 @@ class CrawlComponent extends Component implements CrawlConstants
                 PageProcessor::$indexed_file_types;
         }
         $loaded = false;
-        if (isset($_REQUEST['load_option']) &&
-            $_REQUEST['load_option'] > 0) {
+        if (isset($_REQUEST['load_option']) && $_REQUEST['load_option'] > 0) {
             if ($_REQUEST['load_option'] == 1) {
                 $seed_loaded = $crawl_model->getSeedInfo(true);
             } else {
@@ -1552,8 +1540,8 @@ class CrawlComponent extends Component implements CrawlConstants
                     $timestamp, $machine_urls);
             }
             $copy_options = ["general" => ["page_recrawl_frequency",
-                "page_range_request", "max_description_len", "cache_pages",
-                'summarizer_option'],
+                "page_range_request", "max_description_len",
+                "max_links_to_extract", "cache_pages", 'summarizer_option'],
                 "indexed_file_types" => ["extensions"],
                 "indexing_plugins" => ["plugins", "plugins_data"]];
             foreach ($copy_options as $main_option => $sub_options) {
@@ -1604,7 +1592,7 @@ class CrawlComponent extends Component implements CrawlConstants
         }
         if (isset($_REQUEST["max_links_to_extract"]) &&
             in_array($_REQUEST["max_links_to_extract"],
-            $data['MAX_LINKS_VALUES'])) {
+            array_keys($data['MAX_LINKS_VALUES']))) {
             $seed_info["general"]["max_links_to_extract"] =
                 $_REQUEST["max_links_to_extract"];
         }
@@ -1644,7 +1632,7 @@ class CrawlComponent extends Component implements CrawlConstants
         }
         $data['MAX_LEN'] = $seed_info["general"]["max_description_len"];
         $data['MAX_LINKS_TO_EXTRACT'] =
-            $seed_info["general"]["MAX_LINKS_TO_EXTRACT"] ??
+            $seed_info["general"]["max_links_to_extract"] ??
             C\MAX_LINKS_TO_EXTRACT;
         $data['INDEXING_PLUGINS'] = [];
         $included_plugins = [];
@@ -1964,7 +1952,7 @@ class CrawlComponent extends Component implements CrawlConstants
             $processor_name = C\NS_PROCESSORS . $processor_name;
             $page_processor = new $processor_name($plugin_processors,
                 $seed_info["general"]["max_description_len"],
-                $seed_info["general"]["max_links_to_extract"],
+                -1,
                 $seed_info["general"]["summarizer_option"]);
             set_error_handler(null);
             if (L\generalIsA($processor_name, C\NS_PROCESSORS.
@@ -1993,7 +1981,8 @@ class CrawlComponent extends Component implements CrawlConstants
             if ($processor_name != C\NS_PROCESSORS . "RobotProcessor" &&
                 !isset($doc_info[self::JUST_METAS])) {
                 $doc_info[self::LINKS] = UrlParser::pruneLinks(
-                    $doc_info[self::LINKS]);
+                    $doc_info[self::LINKS],
+                    $seed_info["general"]["max_links_to_extract"]);
             }
             foreach ($doc_info as $key => $value) {
                 $site[$key] = $value;
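A note on the null-coalescing conversions above: ?? only falls back when its
left operand is null or an unset index, so the operator has to wrap the array
lookup itself. A form like intval($x) ?? $default can never take the default,
because intval() always returns an int. The snippet below is an illustrative
sketch with invented values, not code from the patch:

    <?php
    // Illustrative sketch, not part of the patch.
    $general = [];  // crawl profile with 'max_description_len' missing
    // Dead default: intval() never returns null, so this yields 0 and
    // raises an undefined-index warning when the key is absent.
    $len = intval($general['max_description_len']) ?? 2000;
    // Working form: coalesce the lookup, then cast.
    $len = intval($general['max_description_len'] ?? 2000);  // 2000
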
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 5e748b8b6..ada6af60c 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -556,12 +556,10 @@ class Fetcher implements CrawlConstants
         if (isset($this->plugin_processors[$page_processor])) {
             $processor_cache[$type] = new $page_processor(
                 $this->plugin_processors[$page_processor],
-                $this->max_description_len, $this->max_links_to_extract,
-                $this->summarizer_option);
+                $this->max_description_len, -1, $this->summarizer_option);
         } else {
             $processor_cache[$type] = new $page_processor([],
-                $this->max_description_len, $this->max_links_to_extract,
-                $this->summarizer_option);
+                $this->max_description_len, -1, $this->summarizer_option);
         }
         if (L\generalIsA($page_processor, $text_processor)) {
             $processor_cache[$type]->text_data = true;
@@ -1630,6 +1628,9 @@
         if (isset($info[self::MAX_DESCRIPTION_LEN])) {
             $this->max_description_len = $info[self::MAX_DESCRIPTION_LEN];
         }
+        if (isset($info[self::MAX_LINKS_TO_EXTRACT])) {
+            $this->max_links_to_extract = $info[self::MAX_LINKS_TO_EXTRACT];
+        }
     }
     /**
      * Prepare an array of up to NUM_MULTI_CURL_PAGES' worth of sites to be
@@ -2314,10 +2315,9 @@
             "d" . $this->allow_disallow_cache_time);
     }
     /**
-     * Page processors are allowed to extract up to MAX_LINKS_TO_EXTRACT
      * This method attempts to cull from the doc_info struct the
-     * best MAX_LINKS_PER_PAGE. Currently, this is done by first removing
-     * links of filetype or sites the crawler is forbidden from crawl.
+     * best $this->max_links_to_extract. Currently, this is done by first
+     * removing links of filetype or sites the crawler is forbidden from crawling.
      * Then a crude estimate of the information contained in the links test:
      * strlen(gzip(text)) is used to extract the best remaining links.
      *
@@ -2366,7 +2366,8 @@ class Fetcher implements CrawlConstants
             }
             $links[$url] = $url_info;
         }
-        $doc_info[$field] = UrlParser::pruneLinks($links);
+        $doc_info[$field] = UrlParser::pruneLinks($links,
+            $this->max_links_to_extract);
     }
     /**
      * Copies fields from the array of site data to the $i indexed
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 5f6a8391b..8c151aa4b 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -168,6 +168,11 @@ class QueueServer implements CrawlConstants, Join
      * @var int
      */
     public $max_description_len;
+    /**
+     * Maximum number of urls to extract from a single document
+     * @var int
+     */
+    public $max_links_to_extract;
     /**
      * Number of days between resets of the page url filter
      * If nonpositive, then never reset filter
@@ -324,6 +329,7 @@ class QueueServer implements CrawlConstants, Join
         "sleep_duration" => self::SLEEP_DURATION,
         "robots_txt" => self::ROBOTS_TXT,
         "max_description_len" => self::MAX_DESCRIPTION_LEN,
+        "max_links_to_extract" => self::MAX_LINKS_TO_EXTRACT,
         "page_recrawl_frequency" => self::PAGE_RECRAWL_FREQUENCY,
         "indexed_file_types" => self::INDEXED_FILE_TYPES,
         "restrict_sites_by_url" => self::RESTRICT_SITES_BY_URL,
@@ -369,6 +375,7 @@ class QueueServer implements CrawlConstants, Join
         $this->page_recrawl_frequency = C\PAGE_RECRAWL_FREQUENCY;
         $this->page_range_request = C\PAGE_RANGE_REQUEST;
         $this->max_description_len = C\MAX_DESCRIPTION_LEN;
+        $this->max_links_to_extract = C\MAX_LINKS_TO_EXTRACT;
         $this->server_type = self::BOTH;
         $this->indexing_plugins = [];
         $this->indexing_plugins_data = [];
@@ -806,7 +813,8 @@ class QueueServer implements CrawlConstants, Join
             $this->processEtagExpires();
         }
         $count = $this->web_queue->to_crawl_queue->count;
-        $max_links = max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP);
+        $max_links = max(C\MAX_LINKS_TO_EXTRACT,
+            $this->max_links_to_extract);
         if ($count < C\NUM_URLS_QUEUE_RAM -
             C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links) {
             $this->processQueueUrls();
@@ -1562,23 +1570,9 @@ class QueueServer implements CrawlConstants, Join
             L\crawlLog("...none.");
             return;
         }
-        $updatable_info = [
-            "repeat_type" =>self::REPEAT_TYPE,
-            "sleep_start" =>self::SLEEP_START,
-            "sleep_duration" =>self::SLEEP_DURATION,
-            "robots_txt" =>self::ROBOTS_TXT,
-            "page_range_request" => self::PAGE_RANGE_REQUEST,
-            "max_description_len" => self::MAX_DESCRIPTION_LEN,
-            "page_recrawl_frequency" => self::PAGE_RECRAWL_FREQUENCY,
-            "restrict_sites_by_url" => self::RESTRICT_SITES_BY_URL,
-            "cache_pages" => self::CACHE_PAGES,
-            "allowed_sites" => self::ALLOWED_SITES,
-            "disallowed_sites" => self::DISALLOWED_SITES,
-            "page_rules" => self::PAGE_RULES,
-            "indexed_file_types" => self::INDEXED_FILE_TYPES,
-            "indexing_plugins" => self::INDEXING_PLUGINS,
-            "indexing_plugins_data" => self::INDEXING_PLUGINS_DATA,
-        ];
+        $updatable_info = self::$info_parameter_map;
+        unset($updatable_info["crawl_order"], $updatable_info["crawl_type"],
+            $updatable_info["crawl_index"], $updatable_info["crawl_pages"]);
         $keys = array_keys($updatable_info);
         $archive_info = $index_archive_class::getArchiveInfo($dir);
         $index_info = unserialize($archive_info['DESCRIPTION']);
@@ -1852,7 +1846,7 @@ class QueueServer implements CrawlConstants, Join
         $num = 0;
         $bad = false;
         $max_batch_sites_and_links = C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER *
-            (max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP) + 1);
+            (max(C\MAX_LINKS_TO_EXTRACT, $this->max_links_to_extract) + 1);
         while($pos < $len_urls && $num <= $max_batch_sites_and_links) {
             L\crawlTimeoutLog("..Indexer still processing index data at " .
                 "position %s of out of %s", $pos, $len_urls);
@@ -2434,6 +2428,7 @@ class QueueServer implements CrawlConstants, Join
         $sites[self::INDEXING_PLUGINS_DATA] = $this->indexing_plugins_data;
         $sites[self::PAGE_RANGE_REQUEST] = $this->page_range_request;
         $sites[self::MAX_DESCRIPTION_LEN] = $this->max_description_len;
+        $sites[self::MAX_LINKS_TO_EXTRACT] = $this->max_links_to_extract;
         $sites[self::POST_MAX_SIZE] = L\metricToInt(ini_get("post_max_size"));
         $sites[self::SITES] = [];
         return base64_encode(serialize($sites)) . "\n";
@@ -2479,7 +2474,7 @@ class QueueServer implements CrawlConstants, Join
            number of slots
         */
         $num_waiting_urls = 0;
-        $max_links = max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP);
+        $max_links = max(C\MAX_LINKS_TO_EXTRACT, $this->max_links_to_extract);
         $max_queue_size = C\NUM_URLS_QUEUE_RAM -
             C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links;
         $examined_count = 0;
@@ -2813,7 +2808,8 @@ class QueueServer implements CrawlConstants, Join
             "Time failing to make a fetch batch:" .
             L\changeInMicrotime($start_time)." . Loop properties:$i $count".
             " $num_deletes urls were deleted in failed attempt.");
-        $max_links = max(C\MAX_LINKS_PER_PAGE, C\MAX_LINKS_PER_SITEMAP);
+        $max_links = max(C\MAX_LINKS_TO_EXTRACT,
+            $this->max_links_to_extract);
         if ($num_deletes < 5 && $i >= $count &&
             $count >= C\NUM_URLS_QUEUE_RAM -
             C\SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links) {
diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php
index f28f96c94..732b9014a 100755
--- a/src/library/CrawlConstants.php
+++ b/src/library/CrawlConstants.php
@@ -229,7 +229,7 @@ interface CrawlConstants
     const CENTROID_SUMMARIZER = 'dl';
     const SUMMARIZER_OPTION = 'dm';
     const WORD_CLOUD = 'dn';
-    const THESAURUS_SCORE ='do';
+    const MAX_LINKS_TO_EXTRACT = 'do';
     const IS_GOPHER_URL = "dp";
     const MINIMUM_FETCH_LOOP_TIME = "dq";
     const IMAGE_LINK = "dr";
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 82a39289b..dff09f343 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -934,17 +934,18 @@ class UrlParser
      * @return array $out_links extracted from $links according to the
      *      description above.
      */
-    public static function pruneLinks($links, $max_links = C\MAX_LINKS_PER_PAGE)
+    public static function pruneLinks($links, $max_links =
+        C\MAX_LINKS_TO_EXTRACT)
     {
         $consonants = "bcdfghjklmnpqrstvyz";
         $vowels = "aeiouy";
         $digit_consonants = "([$consonants]|\d)";
         $digit_vowels = "([$vowels]|\d)";
-        if (count($links) <= C\MAX_LINKS_PER_PAGE) {
+        if ($max_links <= 0 || count($links) <= $max_links) {
             return $links;
         }
         $info_link = [];
-        // choose the MAX_LINKS_PER_PAGE many pages with most info (crude)
+        // choose the $max_links many pages with most info (crude)
         foreach ($links as $url => $text) {
             $text = (is_string($text)) ?
                 $text : "";
             $terms = preg_split("/\s+|\-|\_|\~/", $text);
@@ -961,7 +962,7 @@
         }
         arsort($info_link);
         $link_urls = array_keys(array_slice($info_link, 0,
-            C\MAX_LINKS_PER_PAGE));
+            $max_links));
         $out_links = [];
         foreach ($link_urls as $url) {
             $out_links[$url] = $links[$url];
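To make the revised pruneLinks() contract concrete, here is a hypothetical
call (invented URLs and limit; only the function itself is from the patch).
A positive $max_links keeps the links whose text carries the most information
by the crude estimate above; a non-positive value now means no pruning at all:

    <?php
    // Hypothetical usage of the revised UrlParser::pruneLinks();
    // assumes the Yioop autoloader is already set up.
    use seekquarry\yioop\library\UrlParser;

    $links = [
        "https://example.com/a" => "minutes of the march board meeting",
        "https://example.com/b" => "b",
        "https://example.com/c" => "annual financial report for 2024",
    ];
    // Keep only the two most informative links.
    $best = UrlParser::pruneLinks($links, 2);
    // A non-positive limit short-circuits: every link comes back unpruned.
    $all = UrlParser::pruneLinks($links, -1);
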
diff --git a/src/library/media_jobs/AnalyticsJob.php b/src/library/media_jobs/AnalyticsJob.php
index a9eea2b1a..cdc305693 100644
--- a/src/library/media_jobs/AnalyticsJob.php
+++ b/src/library/media_jobs/AnalyticsJob.php
@@ -35,6 +35,7 @@ use seekquarry\yioop\library as L;
 use seekquarry\yioop\library\CrawlConstants;
 use seekquarry\yioop\library\UrlParser;
 use seekquarry\yioop\library\processors\PageProcessor;
+use seekquarry\yioop\models\CrawlModel;
 use seekquarry\yioop\models\ImpressionModel;
 use seekquarry\yioop\models\MachineModel;
 use seekquarry\yioop\models\PhraseModel;
@@ -70,6 +71,12 @@ class AnalyticsJob extends MediaJob
      * @var object
      */
     public $machine_model;
+    /**
+     * Used to get crawl seed info
+     *
+     * @var object
+     */
+    public $crawl_model;
     /**
      * For size and time distributions the number of times the minimal
      * recorded interval (DOWNLOAD_SIZE_INTERVAL for size) to check for
@@ -92,6 +99,7 @@ class AnalyticsJob extends MediaJob
         $this->impression_model = new ImpressionModel();
         $this->phrase_model = new PhraseModel();
         $this->machine_model = new MachineModel();
+        $this->crawl_model = new CrawlModel();
         PageProcessor::initializeIndexedFileTypes();
     }
     /**
@@ -165,6 +173,8 @@ class AnalyticsJob extends MediaJob
         L\crawlLog("Starting to compute statistics for timestamp index ".
             $data["TIMESTAMP"]);
         $machine_urls = $this->machine_model->getQueueServerUrls();
+        $seed_info = $this->crawl_model->getCrawlSeedInfo($data["TIMESTAMP"],
+            $machine_urls);
         $num_machines = count($machine_urls);
         if ($num_machines < 1 || ($num_machines == 1 &&
             UrlParser::isLocalhostUrl($machine_urls[0]))) {
@@ -249,7 +259,9 @@
         for ($i = 0; $i <= self::NUM_TIMES_INTERVAL; $i++) {
             $queries["DNS"][] = $i * C\DOWNLOAD_TIME_INTERVAL;
         }
-        for ($i = 0; $i <= C\MAX_LINKS_PER_SITEMAP; $i++) {
+        for ($i = 0; $i <= max(
+            $seed_info["general"]['max_links_to_extract'] ?? 0,
+            C\MAX_LINKS_TO_EXTRACT); $i++) {
             $queries["NUMLINKS"][] = $i;
         }
         $date = date("Y");
diff --git a/src/library/processors/DocxProcessor.php b/src/library/processors/DocxProcessor.php
index e58393112..b23d5b898 100644
--- a/src/library/processors/DocxProcessor.php
+++ b/src/library/processors/DocxProcessor.php
@@ -134,7 +134,8 @@ class DocxProcessor extends TextProcessor
         foreach ($relations as $relation) {
             if (strcmp(
                 $relation->getAttribute('Type'), $hyperlink) == 0 ) {
-                if ($i < self::$max_links_to_extract) {
+                if (self::$max_links_to_extract < 0 ||
+                    $i < self::$max_links_to_extract) {
                     $link = $relation->getAttribute('Target');
                     $url = UrlParser::canonicalLink(
                         $link, $site);
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index 1d17c3f1a..5d9ae8566 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -442,7 +442,8 @@ class HtmlProcessor extends TextProcessor
         $i = 0;
         $hrefs = $xpath->evaluate("/html/body//a");
         foreach ($hrefs as $href) {
-            if ($i < self::$max_links_to_extract) {
+            if (self::$max_links_to_extract < 0 ||
+                $i < self::$max_links_to_extract) {
                 $rel = $href->getAttribute("rel");
                 if ($rel == "" || !stristr($rel, "nofollow")) {
                     $url = UrlParser::canonicalLink(
@@ -502,7 +503,8 @@ class HtmlProcessor extends TextProcessor
         }
         $frames = $xpath->evaluate("/html/frameset/frame|/html/body//iframe");
         foreach ($frames as $frame) {
-            if ($i < self::$max_links_to_extract) {
+            if (self::$max_links_to_extract < 0 ||
+                $i < self::$max_links_to_extract) {
                 $url = UrlParser::canonicalLink(
                     $frame->getAttribute('src'), $site);
                 $len = strlen($url);
@@ -520,7 +522,8 @@ class HtmlProcessor extends TextProcessor
         $imgs = $xpath->evaluate("/html/body//img[@alt]");
         $i = 0;
         foreach ($imgs as $img) {
-            if ($i < self::$max_links_to_extract) {
+            if (self::$max_links_to_extract < 0 ||
+                $i < self::$max_links_to_extract) {
                 $alt = $img->getAttribute('alt');
                 if (strlen($alt) < 1) {
                     continue;
diff --git a/src/library/processors/PptxProcessor.php b/src/library/processors/PptxProcessor.php
index 24f7b516c..d944ffddb 100644
--- a/src/library/processors/PptxProcessor.php
+++ b/src/library/processors/PptxProcessor.php
@@ -146,9 +146,10 @@ class PptxProcessor extends TextProcessor
         $xpath = new \DOMXPath($dom);
         $paras = $xpath->evaluate("/p:sld//p:cSld//p:spTree//p:sp//
             p:txBody//a:p//a:r//a:rPr//a:hlinkClick");
-        $i=0;
+        $i = 0;
         foreach ($paras as $para) {
-            if ($i < C\MAX_LINKS_TO_EXTRACT) {
+            if (self::$max_links_to_extract < 0 ||
+                $i < self::$max_links_to_extract) {
                 if (empty($para->parentNode->parentNode->
                     getElementsByTagName("t")->item(0)->nodeValue)) {
                     continue;
diff --git a/src/library/processors/RssProcessor.php b/src/library/processors/RssProcessor.php
index 1a5485546..63d97cd9a 100644
--- a/src/library/processors/RssProcessor.php
+++ b/src/library/processors/RssProcessor.php
@@ -271,7 +271,8 @@ class RssProcessor extends TextProcessor
                 $sites[$url] = $info;
                 $i++;
             }
-            if ($i >= C\MAX_LINKS_TO_EXTRACT) {
+            if (self::$max_links_to_extract > 0 &&
+                $i >= self::$max_links_to_extract) {
                 break 2;
             }
         }
diff --git a/src/library/processors/SitemapProcessor.php b/src/library/processors/SitemapProcessor.php
index feb390477..2b629b436 100644
--- a/src/library/processors/SitemapProcessor.php
+++ b/src/library/processors/SitemapProcessor.php
@@ -119,7 +119,8 @@ class SitemapProcessor extends TextProcessor
                 $sites[$url] = "From sitemap : " . $site . " .." .
                     UrlParser::extractTextFromUrl($url);
                 $i++;
-                if ($i > self::$max_links_to_extract) {
+                if (self::$max_links_to_extract > 0 &&
+                    $i >= self::$max_links_to_extract) {
                     break 2;
                 }
             }
diff --git a/src/library/processors/TextProcessor.php b/src/library/processors/TextProcessor.php
index 783888f02..f6752516f 100755
--- a/src/library/processors/TextProcessor.php
+++ b/src/library/processors/TextProcessor.php
@@ -186,7 +186,8 @@ class TextProcessor extends PageProcessor
                 strlen($url) > 4) {
                 $sites[$url] = UrlParser::extractTextFromUrl($url);
                 $i++;
-                if ($i >= self::$max_links_to_extract) {
+                if (self::$max_links_to_extract > 0 &&
+                    $i >= self::$max_links_to_extract) {
                     break;
                 }
             }
diff --git a/src/library/processors/XlsxProcessor.php b/src/library/processors/XlsxProcessor.php
index 08655c332..386af34f4 100644
--- a/src/library/processors/XlsxProcessor.php
+++ b/src/library/processors/XlsxProcessor.php
@@ -226,7 +226,8 @@ class XlsxProcessor extends TextProcessor
         foreach ($relations as $relation) {
             if ( strcmp(
                 $relation->getAttribute('Type'), $hyperlink) == 0 ) {
-                if ($i < self::$max_links_to_extract) {
+                if (self::$max_links_to_extract < 0 ||
+                    $i < self::$max_links_to_extract) {
                     $link = $relation->getAttribute('Target');
                     $url = UrlParser::canonicalLink(
                         $link, $site);
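The processor hunks above all adopt the same sentinel convention: page
processors are now constructed with -1, each extraction loop treats a
non-positive self::$max_links_to_extract as unlimited, and the real per-crawl
cap is applied afterwards by UrlParser::pruneLinks(). The stand-alone sketch
below (stand-in variables, not patch code) shows the two guard styles side by
side:

    <?php
    // Sketch of the two guard styles; $max_links_to_extract here is a
    // stand-in for the static processor member, with -1 meaning "no cap".
    $max_links_to_extract = -1;
    $candidate_urls = ["https://example.com/a", "https://example.com/b"];
    $sites = [];
    $i = 0;
    foreach ($candidate_urls as $url) {
        // Proceed-style guard, as in the Docx/Html/Pptx/Xlsx processors:
        if ($max_links_to_extract < 0 || $i < $max_links_to_extract) {
            $sites[$url] = "link text for $url";
            $i++;
        }
        // Break-style guard, as in the Rss/Sitemap/Text processors:
        if ($max_links_to_extract > 0 && $i >= $max_links_to_extract) {
            break;
        }
    }
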
"true" : "false"; $n[] = "cache_pages = $bool_string;"; @@ -672,6 +660,8 @@ EOT; C\PAGE_RANGE_REQUEST], "max_description_len" => [self::MAX_DESCRIPTION_LEN, C\MAX_DESCRIPTION_LEN], + "max_links_to_extract" => [self::MAX_LINKS_TO_EXTRACT, + C\MAX_LINKS_TO_EXTRACT], ]; foreach ($general_params as $param => $info) { $seed_info['general'][$param] = (isset($index_info[$info[0]])) ?