Reduce logic to retrieve a cached getIndex call, a=chris

Chris Pollett [2021-09-12 20:Sep:th]
Reduce logic to retrieve a cached getIndex call, a=chris
Filename
src/executables/QueueServer.php
src/library/IndexDocumentBundle.php
src/library/IndexManager.php
src/library/index_bundle_iterators/DocIterator.php
src/library/index_bundle_iterators/WordIterator.php
tests/IndexDocumentBundleTest.php
tests/IndexManagerTest.php
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index cd266739f..8b2d955ce 100755
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -599,8 +599,7 @@ class QueueServer implements CrawlConstants
                 "running.  Assume still running.");
             return;
         }
-        $filters = ($process == self::INDEXER) ? ["Indexer", "merg",
-            "index shard"] : ["Scheduler"];
+        $filters = ($process == self::INDEXER) ? ["Indexer"] : ["Scheduler"];
         $process_lines = L\lineFilter($lines, $filters);
         L\crawlLog("...Filtered " . $this->process_name . ".log lines");
         $num_lines = count($process_lines);
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index e6df704fe..4c8997e92 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -222,7 +222,7 @@ class IndexDocumentBundle implements CrawlConstants
      */
     public function addPages($pages, $visited_urls_count)
     {
-        crawlLog("Adding pages to document bundle...");
+        crawlLog("Indexer adding pages to document bundle...");
         $success = $this->documents->put($pages);
         $this->documents->addCount($visited_urls_count,
             "VISITED_URLS_COUNT");
@@ -231,7 +231,11 @@ class IndexDocumentBundle implements CrawlConstants
     /**
      *
      * @param int $add_num_docs number of docs in the shard about to be added
-     * @param string $taking_too_long_touch
+     * @param string $taking_too_long_touch a filename of a file to touch
+     *  so its last modified time becomes the current time. In a typical
+     *  Yioop crawl this is done for the crawl_status.txt file to prevent
+     *  Yioop's web interface from stopping the crawl because it has seen
+     *  no recent progress activity on a crawl.
      */
     public function updateDictionary($taking_too_long_touch = null)
     {
@@ -270,9 +274,16 @@ class IndexDocumentBundle implements CrawlConstants
         return $advanced_partition;
     }
     /**
+     * Adds the previously constructed inverted index $partition to the inverted
+     * index of the whole bundle
      *
-     * @param int $partition;
-     * @param string $taking_too_long_touch
+     * @param int $partition which partition's inverted index to add, by
+     *  default the current save partition
+     * @param string $taking_too_long_touch a filename of a file to touch
+     *  so its last modified time becomes the current time. In a typical
+     *  Yioop crawl this is done for the crawl_status.txt file to prevent
+     *  Yioop's web interface from stopping the crawl because it has seen
+     *  no recent progress activity on a crawl.
      */
     public function addPartitionPostingsDictionary($partition = -1,
         $taking_too_long_touch = null)
@@ -374,21 +385,26 @@ class IndexDocumentBundle implements CrawlConstants
      * Builds an inverted index shard for a documents PartitionDocumentBundle
      * partition.
      * @param int $partition to build index for
-     * @param string $taking_too_long_touch
+     * @param string $taking_too_long_touch a filename of a file to touch
+     *  so its last modified time becomes the current time. In a typical
+     *  Yioop crawl this is done for the crawl_status.txt file to prevent
+     *  Yioop's web interface from stopping the crawl because it has seen
+     *  no recent progress activity on a crawl.
      * @return mixed whether job executed to completion (true or false) if
-     *      !$just_stats, otherwise, an array with NUM_DOCS, NUM_LINKS,
-     *      and TERM_STATISTICS (the latter having term frequency info)
+     *  !$just_stats, otherwise, an array with NUM_DOCS, NUM_LINKS,
+     *  and TERM_STATISTICS (the latter having term frequency info)
      */
     public function buildInvertedIndexPartition($partition = -1,
         $taking_too_long_touch = null, $just_stats = false)
     {
         $start_time = microtime(true);
-        crawlLog("  Start building inverted index ...  Current Memory:".
+        crawlLog("  Indexer start building inverted index ...  Current Memory:".
             memory_get_usage());
         if ($partition < 0) {
             $partition = $this->documents->parameters["SAVE_PARTITION"];
         }
-        crawlLog("Building index inverted index for partition $partition");
+        crawlLog(
+            "Indexer Building index inverted index for partition $partition");
         $base_folder = $this->getPartitionBaseFolder($partition);
         if (!file_exists($base_folder)) {
             mkdir($base_folder);
@@ -411,7 +427,7 @@ class IndexDocumentBundle implements CrawlConstants
         $postings = [];
         $last_entries = [];
         $positions = "";
-        crawlLog("Preparing Index Map...");
+        crawlLog("Indexer Preparing Index Map...");
         $index_map = $this->prepareIndexMap($partition);
         crawlLog("Number of documents in mapped partition:" .
             count($index_map));
@@ -577,13 +593,13 @@ class IndexDocumentBundle implements CrawlConstants
                     $word_lists, $meta_ids, $doc_map_index, $last_entries);
                 $interim_elapse = changeInMicrotime($interim_time);
                 if ($interim_elapse > 5) {
-                    crawlLog("..Inverting " . $link_to . $site_url .
+                    crawlLog("..Indexer Inverting " . $link_to . $site_url .
                     "...took > 5s.");
                 }
-                if (crawlTimeoutLog("..Still building inverted index. Have ".
-                    "processed %s of %s documents.\n" .
-                    "Total links or docs processed is %s.\n" .
-                    "Last url processed was %s.",
+                if (crawlTimeoutLog("..Indexer Still building inverted index.".
+                    "\n....Have processed %s of %s documents.\n" .
+                    "....Total links or docs processed is %s.\n" .
+                    "....Last url processed was %s.",
                     $non_aux_doc_cnt, $num_partition, $non_aux_doc_cnt + $cnt,
                     $link_to . $site_url) && $taking_too_long_touch) {
                     if (file_exists($taking_too_long_touch)) {
@@ -609,7 +625,7 @@ class IndexDocumentBundle implements CrawlConstants
         $postings_tools->save($postings_filename, $postings);
         $last_entries_tools->save($last_entries_filename, $last_entries);
         file_put_contents($positions_filename, $positions);
-        crawlLog("  Build inverted index time ".
+        crawlLog("  Indexer build inverted index time ".
             changeInMicrotime($start_time));
         return true;
     }
diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php
index 8eb4f1c4f..5f357cf34 100644
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@@ -70,6 +70,12 @@ class IndexManager implements CrawlConstants
     public static function getIndex($index_name)
     {
         $index_name = trim($index_name); //trim to fix postgres quirkiness
+        if (!empty(self::$index_times[$index_name]) &&
+            ($index_name != 'feed' || php_sapi_name() != 'cli' ||
+            (time() - self::$index_times[$index_name])
+            < C\MIN_QUERY_CACHE_TIME) ) {
+            return self::$indexes[$index_name];
+        }
         $cache_dir = C\CRAWL_DIR . '/cache/';
         $index_archive_full_path = $cache_dir . self::index_data_base_name .
             $index_name;
@@ -109,74 +115,68 @@ class IndexManager implements CrawlConstants
         } else if (!$handled) {
             return false;
         }
-        if (empty(self::$indexes[$index_name]) ||
-            (!empty(self::$index_times[$index_name]) &&
-            ($index_name == 'feed' && php_sapi_name() == 'cli') &&
-            (time() - self::$index_times[$index_name])
-            > C\MIN_QUERY_CACHE_TIME) ) {
-            if (!isset($tmp)) {
-                $index_filename = $cache_dir . $index_archive_name;
-                if (file_exists($index_filename)) {
-                    $is_old = (file_exists($index_filename . "/summaries"));
-                    $bundle_class_name = ($is_old) ? C\NS_LIB .
-                        "IndexArchiveBundle" : C\NS_LIB . "IndexDocumentBundle";
-                    $tmp = new $bundle_class_name($cache_dir .
-                        $index_archive_name);
-                    if (!$tmp) {
-                        return false;
-                    }
-                } else {
-                    $tmp = false;
-                    $use_name = $index_name;
-                    $serve_archive = -1;
-                    if (preg_match("/\-\d$/", $index_name)) {
-                        $serve_archive = substr($index_name, -1);
-                        $use_name = substr($index_name, 0, -2);
-                    }
-                    $index_archive_name = self::double_index_base_name .
-                        $use_name;
-                    $status_file = $cache_dir . $index_archive_name .
-                        "/status.txt";
-                    if ($serve_archive < 0 && file_exists($status_file)) {
-                        $status = unserialize(file_get_contents($status_file));
-                        $active_archive = (empty($status["swap_count"])) ? 1 :
-                            $status["swap_count"] % 2;
-                        $serve_archive = 1 - $active_archive;
-                    }
-                    $is_old = (file_exists($index_filename .
-                        "/bundle0/summaries") ||
-                        file_exists($index_filename . "/bundle1/summaries"));
-                    $bundle_class_name = ($is_old) ?
-                        C\NS_LIB . "IndexArchiveBundle"
-                        : C\NS_LIB . "IndexDocumentBundle";
+        if (!isset($tmp)) {
+            $index_filename = $cache_dir . $index_archive_name;
+            if (file_exists($index_filename)) {
+                $is_old = (file_exists($index_filename . "/summaries"));
+                $bundle_class_name = ($is_old) ? C\NS_LIB .
+                    "IndexArchiveBundle" : C\NS_LIB . "IndexDocumentBundle";
+                $tmp = new $bundle_class_name($cache_dir .
+                    $index_archive_name);
+                if (!$tmp) {
+                    return false;
+                }
+            } else {
+                $tmp = false;
+                $use_name = $index_name;
+                $serve_archive = -1;
+                if (preg_match("/\-\d$/", $index_name)) {
+                    $serve_archive = substr($index_name, -1);
+                    $use_name = substr($index_name, 0, -2);
+                }
+                $index_archive_name = self::double_index_base_name .
+                    $use_name;
+                $status_file = $cache_dir . $index_archive_name .
+                    "/status.txt";
+                if ($serve_archive < 0 && file_exists($status_file)) {
+                    $status = unserialize(file_get_contents($status_file));
+                    $active_archive = (empty($status["swap_count"])) ? 1 :
+                        $status["swap_count"] % 2;
+                    $serve_archive = 1 - $active_archive;
+                }
+                $is_old = (file_exists($index_filename .
+                    "/bundle0/summaries") ||
+                    file_exists($index_filename . "/bundle1/summaries"));
+                $bundle_class_name = ($is_old) ?
+                    C\NS_LIB . "IndexArchiveBundle"
+                    : C\NS_LIB . "IndexDocumentBundle";
+                $tmp = new $bundle_class_name($cache_dir .
+                    $index_archive_name . "/bundle$serve_archive");
+                if (!$tmp) {
+                    $serve_archive = ($serve_archive == 0) ? 1 : 0;
                     $tmp = new $bundle_class_name($cache_dir .
                         $index_archive_name . "/bundle$serve_archive");
-                    if (!$tmp) {
-                        $serve_archive = ($serve_archive == 0) ? 1 : 0;
-                        $tmp = new $bundle_class_name($cache_dir .
-                            $index_archive_name . "/bundle$serve_archive");
-                    }
-                    if (!$tmp) {
-                        return false;
-                    }
+                }
+                if (!$tmp) {
+                    return false;
                 }
             }
-            self::$indexes[$index_name] = $tmp;
-            if ($is_old) {
-                self::$indexes[$index_name]->setCurrentShard(0, true);
-            }
-            self::$index_times[$index_name] = time();
-            /*
-               If too many cached discard oldest 1/3 of cached indices
-             */
-            if (count(self::$indexes) > self::INDEX_CACHE_SIZE) {
-                $times = array_values(self::$index_times);
-                sort($times);
-                $oldest_third = $times[floor(count($times)/3)];
-                foreach (self::$index_times as $name => $time) {
-                    if ($time <= $oldest_third) {
-                        unset(self::$index_times[$name], self::$indexes[$name]);
-                    }
+        }
+        self::$indexes[$index_name] = $tmp;
+        if ($is_old) {
+            self::$indexes[$index_name]->setCurrentShard(0, true);
+        }
+        self::$index_times[$index_name] = time();
+        /*
+           If too many cached discard oldest 1/3 of cached indices
+         */
+        if (count(self::$indexes) > self::INDEX_CACHE_SIZE) {
+            $times = array_values(self::$index_times);
+            sort($times);
+            $oldest_third = $times[floor(count($times)/3)];
+            foreach (self::$index_times as $name => $time) {
+                if ($time <= $oldest_third) {
+                    unset(self::$index_times[$name], self::$indexes[$name]);
                 }
             }
         }
diff --git a/src/library/index_bundle_iterators/DocIterator.php b/src/library/index_bundle_iterators/DocIterator.php
index c17d4735e..e6368e564 100755
--- a/src/library/index_bundle_iterators/DocIterator.php
+++ b/src/library/index_bundle_iterators/DocIterator.php
@@ -180,7 +180,8 @@ class DocIterator extends IndexBundleIterator
                     $doc_map_filename = $base_folder . "/" .
                         IndexDocumentBundle::DOC_MAP_FILENAME;
                     $doc_map_tools = $index->doc_map_tools;
-                    $this->doc_map = $doc_map_tools->load($doc_map_filename);
+                    $this->doc_map = $doc_map_tools->load($doc_map_filename)
+                        ?? [];
                     $doc_keys = array_keys($this->doc_map);
                     $key_index = [];
                     foreach ($this->doc_map as $key => $entry) {
diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php
index 87c67cb53..91a72366b 100644
--- a/src/library/index_bundle_iterators/WordIterator.php
+++ b/src/library/index_bundle_iterators/WordIterator.php
@@ -732,7 +732,7 @@ class WordIterator extends IndexBundleIterator
                         $this->dictionary_info[$this->generation_pointer];
                     $this->current_generation = $partition_info['PARTITION'];
                     $this->start_offset = 0;
-                    $this->last_offset = $partition_info['NUM_DOCS'] - 1;
+                    $this->last_offset = ($partition_info['NUM_DOCS'] ?? 1) - 1;
                 }
                 $this->current_offset = ($is_ascending) ? $this->start_offset:
                     $this->last_offset;
diff --git a/tests/IndexDocumentBundleTest.php b/tests/IndexDocumentBundleTest.php
index 254ef4148..9ea2e35af 100644
--- a/tests/IndexDocumentBundleTest.php
+++ b/tests/IndexDocumentBundleTest.php
@@ -315,14 +315,31 @@ use seekquarry\yioop\library\UnitTest;
         }
     }
     /**
+     * Computes a 24 byte docId by padding an int to the left with 0's
      *
+     * @param int $i integer to make docId from
+     * @return string docid made by padding
      */
     protected function docidFromInt($i)
     {
             return str_pad("$i", 24, "0", STR_PAD_LEFT);
     }
     /**
+     * docids are typically made from three 8 byte strings. This function
+     * takes three ints, left pads each with '0' (\x30), and concatenates
+     * them to make a 24 byte docid. Docids use their 8th byte to say whether
+     * the id is for a document (replaced with 'd') or a link (replaced with
+     * 'l'), so this function uses the value of the $is_doc flag to determine
+     * which value to overwrite the 8th byte with.
      *
+     * @param int $i_hash_url an int for first 8 bytes (in non-artificial docids
+     *      would be for the crawlHash of url document from)
+     * @param int $j_hash_page an int for second 8 bytes (in non-artificial
+     *      docids would be for the crawlHash of document)
+     * @param int $k_hash_host an int for last 8 bytes (in non-artificial
+     *      docids would be for the crawlHash of hostname of site document from)
+     * @param bool $is_doc whether the hash is for a document or a link
+     * @return string 24 byte docid.
      */
     protected function docidFromIntKeys($i_hash_url, $j_hash_page,
         $k_hash_host, $is_doc = true)
diff --git a/tests/IndexManagerTest.php b/tests/IndexManagerTest.php
index 33caf7312..138bd2acf 100644
--- a/tests/IndexManagerTest.php
+++ b/tests/IndexManagerTest.php
@@ -41,7 +41,8 @@ use seekquarry\yioop\library\IndexManager;
 use seekquarry\yioop\library\UnitTest;

 /**
- *
+ * Used to run unit tests for the IndexManager class. IndexManager acts as
+ * a resource manager for the open indexes used to process a query.
  */
  class IndexManagerTest extends UnitTest
 {
@@ -57,10 +58,9 @@ use seekquarry\yioop\library\UnitTest;
      * Prefix of folders for index manager test
      */
     const TEST_DIR = __DIR__ . '/test_files/index_manager_test';
-
     /**
-     * Sets up an array to keep track of what linear hash tables we've made
-     * so that we can delete them when done a test.
+     * Before each test sets up a test directory with an old and new bundle in
+     * it
      */
     public function setUp()
     {
@@ -81,7 +81,7 @@ use seekquarry\yioop\library\UnitTest;
             "/$new_bundle", false, "TestBundle", 3, 5);
     }
     /**
-     *
+     * Used after each test to delete the test directory
      */
     public function tearDown()
     {
@@ -89,7 +89,8 @@ use seekquarry\yioop\library\UnitTest;
         $model->db->unlinkRecursive(self::TEST_DIR);
     }
     /**
-     *
+     * Test case for whether IndexManager can determine the type and version
+     * of an index to be able to return an instantiated version of that index.
      */
     public function getIndexTestCase()
     {
@@ -101,7 +102,8 @@ use seekquarry\yioop\library\UnitTest;
         $this->assertTrue($new_index, "Get existing new bundle not null");
     }
     /**
-     *
+     * Test case used to test whether or not the index manager can determine
+     * the version of a Yioop index.
      */
     public function getVersionTestCase()
     {
@@ -113,7 +115,8 @@ use seekquarry\yioop\library\UnitTest;
         $this->assertEqual($version_new, 3, "Version 3 index detected");
     }
     /**
-     *
+     * Tests if IndexManager can return the dictionary information about a
+     * word stored in an index it manages.
      */
     public function getWordInfoTestCase()
     {
@@ -166,7 +169,21 @@ use seekquarry\yioop\library\UnitTest;
         return str_pad("$i", 24, "0", STR_PAD_LEFT);
     }
     /**
+     * docids are typically made from three 8 byte strings. This function
+     * takes three ints, left pads each with '0' (\x30), and concatenates
+     * them to make a 24 byte docid. Docids use their 8th byte to say whether
+     * the id is for a document (replaced with 'd') or a link (replaced with
+     * 'l'), so this function uses the value of the $is_doc flag to determine
+     * which value to overwrite the 8th byte with.
      *
+     * @param int $i_hash_url an int for first 8 bytes (in non-artificial docids
+     *      would be for the crawlHash of url document from)
+     * @param int $j_hash_page an int for second 8 bytes (in non-artificial
+     *      docids would be for the crawlHash of document)
+     * @param int $k_hash_host an int for last 8 bytes (in non-artificial
+     *      docids would be for the crawlHash of hostname of site document from)
+     * @param bool $is_doc whether the hash is for a document or a link
+     * @return string 24 byte docid.
      */
     protected function docidFromIntKeys($i_hash_url, $j_hash_page,
         $k_hash_host, $is_doc = true)
ViewGit