More PDB parameter tweaks, fix crawlTimeout message for buildInvertedIndex, a=chris

Chris Pollett [2021-09-08, 01h]
More PDB parameter tweaks, fix crawlTimeout message for buildInvertedIndex, a=chris
Filename
src/library/IndexDocumentBundle.php
src/library/PartitionDocumentBundle.php
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 62d9b7f86..288c64526 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -187,9 +187,6 @@ class IndexDocumentBundle implements CrawlConstants
             $num_docs_per_partition,
             PartitionDocumentBundle::PARTITION_SIZE_THRESHOLD,
             C\NS_COMPRESSORS . "GzipCompressor");
-        if (!$read_only_archive) {
-            $this->documents->index_cache_size = 1;
-        }
         $this->doc_map_tools = new PackedTableTools([
             "PRIMARY KEY" => ["DOC_KEYS", 24], "POS" => "INT",
             "SCORE" => "FLOAT"], C\NS_COMPRESSORS . "GzipCompressor");
@@ -414,6 +411,7 @@ class IndexDocumentBundle implements CrawlConstants
         crawlLog("Number of documents in mapped partition:" .
             count($index_map));
         $cnt = 0;
+        $non_aux_doc_cnt = 0;
         $num_partition = count($index_map);
         $total_num_words = 0;
         foreach ($index_map as $hash_url => $url_info) {
@@ -569,10 +567,13 @@ class IndexDocumentBundle implements CrawlConstants
                     crawlLog("..Inverting " . $link_to . $site_url .
                     "...took > 5s.");
                 }
+                $non_aux_doc_cnt++;
                 if (crawlTimeoutLog("..Still building inverted index. Have ".
-                    "processed %s of %s documents.\nLast url processed was %s.",
-                    $cnt, $num_partition, $link_to . $site_url) &&
-                    $taking_too_long_touch) {
+                    "processed %s of %s documents.\n" .
+                    "Total links or docs processed is %s.\n" .
+                    "Last url processed was %s.",
+                    $non_aux_doc_cnt, $num_partition, $cnt,
+                    $link_to . $site_url) && $taking_too_long_touch) {
                     if (file_exists($taking_too_long_touch)) {
                         touch($taking_too_long_touch, time());
                     }
diff --git a/src/library/PartitionDocumentBundle.php b/src/library/PartitionDocumentBundle.php
index 76a0145df..df28fe1e8 100644
--- a/src/library/PartitionDocumentBundle.php
+++ b/src/library/PartitionDocumentBundle.php
@@ -66,7 +66,7 @@ class PartitionDocumentBundle
     /**
      *
      */
-    const INDEX_CACHE_SIZE = 15;
+    const INDEX_CACHE_SIZE = 10;
     /**
      *
      */
ViewGit