Reorganizing IndexDocumentBundle folder structures, attempt to fix bug wherein thumbs not showing on wiki media pages

Chris Pollett [2024-01-09 02:Jan:th]
Reorganizing IndexDocumentBundle folder structures, attempt to fix bug wherein thumbs not showing on wiki media pages
Filename
src/configs/Config.php
src/library/IndexDictionary.php
src/library/IndexDocumentBundle.php
src/library/PartitionDocumentBundle.php
src/library/StochasticTermSegmenter.php
src/models/GroupModel.php
tests/IndexDocumentBundleTest.php
diff --git a/src/configs/Config.php b/src/configs/Config.php
index 30574ae45..ba199f6e6 100755
--- a/src/configs/Config.php
+++ b/src/configs/Config.php
@@ -437,6 +437,10 @@ nsconddefine('JOBS_DIR', SCHEDULES_DIR . "/jobs");
  * Directory used by the web page clissfiers classes
  */
 nsconddefine('CLASSIFIERS_DIR', WORK_DIRECTORY . "/classifiers");
+/**
+ * Bundle partitions numbered above this go in an Overflow folder (<= 0: never)
+ */
+nsconddefine('OVERFLOW_THRESHOLD', -1);

 /** Captcha mode indicating to use a hash cash computation for a captcha*/
 nsdefine('HASH_CAPTCHA', 2);
diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php
index b26d751dc..2912a317c 100644
--- a/src/library/IndexDictionary.php
+++ b/src/library/IndexDictionary.php
@@ -552,7 +552,7 @@ class IndexDictionary implements CrawlConstants
      *      record should be negation of higher order bit of the given prefix
      *      letter used by the tier file.
      * @return string a single record with merged strings making use of
-     *      auxliary records as needed containing
+     *      auxiliary records as needed containing
      *      (generation, posting list offset, length) information.
      */
     public function combineDictionaryRecord($record_a, $record_b, $prefix_bit)
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index c8e2344ff..6098f55ce 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -66,7 +66,7 @@ class IndexDocumentBundle implements CrawlConstants
      * Subfolder of IndexDocumentBundle to store the btree with
      * term => posting list information (i.e., the inverted index)
      */
-    const DICTIONARY_FOLDER = "dictionary";
+    const DICTIONARY_FOLDER = "Dictionary";
     /**
      * DocIds are made of three parts: hash of url, hash of document, hash
      * of url hostname. Each of these hashes is  DOCID_PART_LEN long
@@ -120,7 +120,7 @@ class IndexDocumentBundle implements CrawlConstants
      * .ix files which are used to store doc_id and the associated offsets to
      * their summary and actual document within the .txt.gz file
      */
-    const DOCUMENTS_FOLDER = "documents";
+    const DOCUMENTS_FOLDER = "Documents";
     /**
      * Name of the last entries file used to help compute difference lists
      * for doc_map_index, and position list offsets used in postings for the
@@ -153,25 +153,11 @@ class IndexDocumentBundle implements CrawlConstants
      * term.
      */
     const POSTINGS_FILENAME = "postings";
-    /**
-     * Temporary name for postings from a POSTINGS_FILENAME file while
-     * they are being compressed.
-     */
-    const TEMP_POSTINGS_FILENAME = "temp_postings";
     /**
      * How many bytes of posting to buffer before writing, when
      * addPartitionPostingsDictionary
      */
     const POSTINGS_BUFFER_SIZE = 1000000;
-    /**
-     * Name of the folder used to hold position lists and document maps. Within
-     * this folder there is a subfolder for each partition which contains a
-     * doc_map file, postings file for the docs within the partition,
-     * position lists file for those postings, and a last_entries file
-     * used in the computation of difference list for doc_map_index and position
-     * list offsets, as well as number of occurrences of terms.
-     */
-    const POSITIONS_DOC_MAP_FOLDER = "positions_doc_maps";
     /**
      * Holds property value pairs concerning the configuration of the
      * current IndexDocumentBundle
@@ -283,13 +269,13 @@ class IndexDocumentBundle implements CrawlConstants
      */
     public function __construct($dir_name, $read_only_archive = true,
         $description = null, $num_docs_per_partition =
-        C\NUM_DOCS_PER_PARTITION, $max_keys = BPlusTree::MAX_KEYS)
+        C\NUM_DOCS_PER_PARTITION, $max_keys = BPlusTree::MAX_KEYS,
+        $overflow_threshold = C\OVERFLOW_THRESHOLD)
     {
         $this->dir_name = $dir_name;
         $is_dir = is_dir($this->dir_name);
         if (!$is_dir && !$read_only_archive) {
             mkdir($this->dir_name);
-            mkdir($this->dir_name . "/". self::POSITIONS_DOC_MAP_FOLDER);
         } else if (!$is_dir) {
             return false;
         }
@@ -342,7 +328,8 @@ class IndexDocumentBundle implements CrawlConstants
             self::SUMMARY => "SERIAL", self::PAGE => "SERIAL"],
             $num_docs_per_partition,
             PartitionDocumentBundle::PARTITION_SIZE_THRESHOLD,
-            $record_compressor, $blob_compressor);
+            $record_compressor, $blob_compressor,
+            $overflow_threshold);
         if (!$read_only_archive) {
             $this->documents->index_cache_size = 1;
         }
@@ -511,28 +498,23 @@ class IndexDocumentBundle implements CrawlConstants
         $start_time = microtime(true);
         $postings_string = $postings_tools->load($postings_filename,
             PackedTableTools::AS_STRING_MODE);
-        $temp_postings_filename = $base_folder . "/" .
-            self::TEMP_POSTINGS_FILENAME;
-        rename($postings_filename, $temp_postings_filename);
         $posting_files_len = strlen($postings_string);
         //add a marker for the end of the file as a string
         $key_len = $this->postings_tools->key_len;
         $this->last_entries = $last_entries_tools->load($last_entries_filename);
         $num_postings = substr_count($postings_string, "\xFF") + 1;
         $last_marker = 0;
-        $out_postings = "";
-        $postings_offset = 0;
-        $fh = fopen($postings_filename, "w");
         for ($i = 0; $i < $num_postings; $i++) {
             $cur_marker = strpos($postings_string, "\xFF", $last_marker);
             $diff = ($cur_marker === false) ? null :
                 $cur_marker - $last_marker;
             $pre_row = substr($postings_string, $last_marker, $diff);
+            $postings_offset = $last_marker + $key_len;
             $last_marker = $cur_marker + 1;
             $term = substr($pre_row, 0, $key_len);
-            $row = decode255(substr($pre_row, $key_len));
-            $postings_len = strlen($row);
-            $out_postings .= $row;
+            $encode_row = substr($pre_row, $key_len);
+            $postings_len = strlen($encode_row);
+            $row = decode255($encode_row);
             if (crawlTimeoutLog("..Indexer Still processing partition ".
                 "$partition. Have completed $i postings of $num_postings.") &&
                 $taking_too_long_touch) {
@@ -554,25 +536,10 @@ class IndexDocumentBundle implements CrawlConstants
                 "NUM_OCCURRENCES"  => $num_occurrences_term,
                 "POSTINGS_OFFSET" => $postings_offset,
                 "POSTINGS_LEN" => $postings_len]);
-            $postings_offset += $postings_len;
-            if (strlen($out_postings) > self::POSTINGS_BUFFER_SIZE) {
-                fwrite($fh, $out_postings);
-                $out_postings = "";
-            }
         }
         $dictionary->flushLastPutNode();
-        fwrite($fh, $out_postings);
-        fclose($fh);
-        unlink($temp_postings_filename);
         crawlLog("...Finished Adding Partition Posting Info to " .
             "Dictionary: " . changeInMicrotime($start_time));
-        if (!C\nsdefined("KEEP_PARTITION_CALCULATIONS") ||
-            !C\KEEP_PARTITION_CALCULATIONS) {
-            if (file_exists($last_entries_filename)) {
-                unlink($last_entries_filename);
-            }
-            crawlLog("..Done deleting partition posting calculations.");
-        }
     }
     /**
      * Gets the file path corresponding to the partition with index $partition
@@ -584,9 +551,7 @@ class IndexDocumentBundle implements CrawlConstants
      */
     public function getPartitionBaseFolder($partition)
     {
-        $base_folder = $this->dir_name . "/" . self::POSITIONS_DOC_MAP_FOLDER
-            . "/$partition";
-        return $base_folder;
+        return $this->documents->getPartitionFolder($partition);
     }
     /**
      * Given the $doc_id of a document and a $partition to look for it in
@@ -639,13 +604,6 @@ class IndexDocumentBundle implements CrawlConstants
         crawlLog(
             "Indexer Building index inverted index for partition $partition");
         $base_folder = $this->getPartitionBaseFolder($partition);
-        if (!file_exists($base_folder)) {
-            if (!file_exists($this->dir_name . "/".
-                self::POSITIONS_DOC_MAP_FOLDER)) {
-                mkdir($this->dir_name . "/". self::POSITIONS_DOC_MAP_FOLDER);
-            }
-            mkdir($base_folder);
-        }
         /* set up $doc_map_filename, $postings_filename, $postings_filename,
            $positions_filename, etc
          */
@@ -1457,7 +1415,7 @@ class IndexDocumentBundle implements CrawlConstants
      * @param int $threshold after the number of results exceeds this amount
      *     stop looking for more dictionary entries.
      * @param int $offset
-     * @param int $num_partitions
+     * @param int $num_partitions
      * @param bool $with_remaining_total whether to total number of
      *      postings found as well or not
      * @return array either [total, sequence of four tuples]
diff --git a/src/library/PartitionDocumentBundle.php b/src/library/PartitionDocumentBundle.php
index 084ffb70a..27cbd0a95 100644
--- a/src/library/PartitionDocumentBundle.php
+++ b/src/library/PartitionDocumentBundle.php
@@ -64,18 +64,13 @@ class PartitionDocumentBundle
      * Default parameters to use when constructing a PartitionDocumentBundle
      */
     const DEFAULT_PARAMETERS = ["RECORD_COMPRESSOR" => self::DEFAULT_COMPRESSOR,
-        "BLOB_COMPRESSOR" => self::DEFAULT_COMPRESSOR,
-        "COUNT" => 0, "PARTITION_SIZE_THRESHOLD" =>
-        self::PARTITION_SIZE_THRESHOLD,
+        "BLOB_COMPRESSOR" => self::DEFAULT_COMPRESSOR, "COUNT" => 0,
+        "OVERFLOW_THRESHOLD" => C\OVERFLOW_THRESHOLD,
+        "PARTITION_SIZE_THRESHOLD" => self::PARTITION_SIZE_THRESHOLD,
         "FORMAT" => ["PRIMARY KEY" => "KEY", "VALUE" => "BLOB"],
         "MAX_ITEMS_PER_FILE" => self::MAX_ITEMS_PER_FILE,
         "SAVE_PARTITION" => 0, "ACTIVE_COUNT" => 0
     ];
-    /**
-     * Extension for PartitionDocumentBundle partition files used to contain
-     * records
-     */
-    const INDEX_EXTENSION = ".ix";
     /**
      * Default maximum number of records to store in a partition
      */
@@ -85,16 +80,37 @@ class PartitionDocumentBundle
      * PartitionDocumentBundle
      */
     const PARAMETERS_FILE = "pdb_parameters.txt";
+    /**
+     * Prefix to block folders of PartitionDocumentBundle partition files
+     */
+    const BLOCK_PREFIX = "B";
+    /**
+     * Number of partition files to store in a block folder before making
+     * another one
+     */
+    const BLOCK_FACTOR = 10000;
+    /**
+     * Base name (before compressor extension) of a partition's archive file
+     */
+    const ARCHIVE_FILENAME = "archive";
+    /**
+     * Name of the index file stored within each partition's folder
+     */
+    const INDEX_FILENAME = "index";
     /**
      * Prefix to file names of PartitionDocumentBundle partition files
      */
-    const PARTITION_PREFIX = "partition_";
+    const PARTITION_PREFIX = "P";
     /**
      * Maximum number of bytes a partition can have before the next partition
      * is started. Notice this implies a maximum file size to store
      * in BLOB columns
      */
     const PARTITION_SIZE_THRESHOLD = 2147483648;
+    /**
+     * Subfolder for blocks of partitions numbered above OVERFLOW_THRESHOLD
+     */
+    const OVERFLOW_DIR_FOLDER = "Overflow";
     /**
      * Used to store the file handle to, the partition number, and last add time
      * for the last time an item's blob/serial columns were added to for
@@ -200,7 +216,8 @@ class PartitionDocumentBundle
         $max_items_per_file = self::MAX_ITEMS_PER_FILE,
         $partition_size_threshold = self::PARTITION_SIZE_THRESHOLD,
         $record_compressor_type = self::DEFAULT_COMPRESSOR,
-        $blob_compressor_type = self::DEFAULT_COMPRESSOR)
+        $blob_compressor_type = self::DEFAULT_COMPRESSOR,
+        $overflow_threshold = C\OVERFLOW_DIR_THRESHOLD)
     {
         $initial_parameters = self::DEFAULT_PARAMETERS;
         $initial_parameters["PARTITION_SIZE_THRESHOLD"] =
@@ -208,6 +225,7 @@ class PartitionDocumentBundle
         $initial_parameters["MAX_ITEMS_PER_FILE"] = $max_items_per_file;
         $initial_parameters["RECORD_COMPRESSOR"] = $record_compressor_type;
         $initial_parameters["BLOB_COMPRESSOR"] = $blob_compressor_type;
+        $initial_parameters["OVERFLOW_THRESHOLD"] = $overflow_threshold;
         $this->record_compressor = new $record_compressor_type();
         $this->blob_compressor = new $blob_compressor_type();
         $initial_parameters["FORMAT"] = $format;
@@ -216,11 +234,15 @@ class PartitionDocumentBundle
             ini_get('memory_limit'))/128000000));
         $this->folder = $folder;
         $folder_paths = [$folder];
+        if ($overflow_threshold > 0) {
+            $folder_paths[] = $folder . "/" . self::OVERFLOW_DIR_FOLDER;
+        }
         $changed_parameters = false;
         foreach ($folder_paths as $folder_path) {
             if (!file_exists($folder_path)) {
                 $changed_parameters = true;
                 if (!mkdir($folder_path)) {
+                    chmod($folder_path, 0777);
                     return null;
                 }
             }
@@ -383,6 +405,25 @@ class PartitionDocumentBundle
             $previous_instance_time];
         return $value;
     }
+    /**
+     * Path of block folder (under Overflow if past threshold) for partition $i
+     */
+    public function getPartitionBlock($i)
+    {
+        $threshold = $this->parameters["OVERFLOW_THRESHOLD"];
+        $folder = ($threshold > 0 && $i > $threshold) ?
+            $this->folder . "/" . self::OVERFLOW_DIR_FOLDER : $this->folder;
+        $block = sprintf("%'.05d", floor($i / self::BLOCK_FACTOR));
+        return $folder . "/". self::BLOCK_PREFIX . "$block";
+    }
+    /**
+     * Folder path for partition $i inside the block given by getPartitionBlock
+     */
+    public function getPartitionFolder($i)
+    {
+        return $this->getPartitionBlock($i) . "/" . self::PARTITION_PREFIX .
+            sprintf("%'.010d", $i);
+    }
     /**
      * Returns the path to the archive file (used to store BLOB and SERIAL
      * columns) for the $i partition in this PartitionDocumentBundle
@@ -392,8 +433,8 @@ class PartitionDocumentBundle
      */
     public function getPartition($i)
     {
-        return $this->folder . "/" . self::PARTITION_PREFIX .
-            $i . $this->blob_compressor->fileExtension();
+        return $this->getPartitionFolder($i) . "/" . self::ARCHIVE_FILENAME .
+            $this->blob_compressor->fileExtension();
     }
     /**
      * Returns the path to the index file (used to store all columns
@@ -405,8 +446,7 @@ class PartitionDocumentBundle
      */
     public function getPartitionIndex($i)
     {
-        return $this->folder . "/" . self::PARTITION_PREFIX .
-            $i . self::INDEX_EXTENSION;
+        return $this->getPartitionFolder($i) . "/"  . self::INDEX_FILENAME;
     }
     /**
      * Returns the unserialized index file for the $partition partition of
@@ -488,6 +528,18 @@ class PartitionDocumentBundle
         // remove $save_partition from read cache
         unset($this->index_cache[$save_partition]);
         $save_partition_name = $this->getPartition($save_partition);
+        if (!file_exists($save_partition_name)) {
+            $save_block_name = $this->getPartitionBlock($save_partition);
+            if (!file_exists($save_block_name)) {
+                mkdir($save_block_name);
+                chmod($save_block_name, 0777);
+            }
+            $save_folder_name = $this->getPartitionFolder($save_partition);
+            if (!file_exists($save_folder_name)) {
+                mkdir($save_folder_name);
+                chmod($save_folder_name, 0777);
+            }
+        }
         clearstatcache();
         $save_partition_len = file_exists($save_partition_name) ?
             filesize($save_partition_name) : 0;
@@ -598,6 +650,20 @@ class PartitionDocumentBundle
         if (file_exists($new_save_index_name)) {
             unlink($new_save_index_name);
         }
+        $new_save_file_name = $this->getPartition($new_save_partition);
+        if (file_exists($new_save_file_name)) {
+            unlink($new_save_file_name);
+        }
+        $new_save_block_name = $this->getPartitionBlock($new_save_partition);
+        if (!file_exists($new_save_block_name)) {
+            mkdir($new_save_block_name);
+            chmod($new_save_block_name, 0777);
+        }
+        $new_save_folder_name = $this->getPartitionFolder($new_save_partition);
+        if (!file_exists($new_save_folder_name)) {
+            mkdir($new_save_folder_name);
+            chmod($new_save_folder_name, 0777);
+        }
         $this->parameters["SAVE_PARTITION"] = $new_save_partition;
         $this->parameters['COUNT'] += $this->parameters['ACTIVE_COUNT'];
         $this->parameters['ACTIVE_COUNT'] = 0;
@@ -657,8 +723,26 @@ class PartitionDocumentBundle
             }
             if (empty($parameters['SAVE_PARTITION']) ||
                 $parameters['SAVE_PARTITION'] == 0) {
-                $parameters['SAVE_PARTITION'] =
-                    max(count(glob("$folder/*" . self::INDEX_EXTENSION))-1, 0);
+                $block_folders = glob("$folder/" . self::BLOCK_PREFIX . "*");
+                $parameters['SAVE_PARTITION'] = 0;
+                if (!empty($block_folders)) {
+                    $overflow_block_folders = glob("$folder/" .
+                        self::OVERFLOW_DIR_FOLDER . "/" .
+                        self::BLOCK_PREFIX . "*");
+                    if (!empty($overflow_block_folders) &&
+                        count($overflow_block_folders) > 0) {
+                        $block_folders = $overflow_block_folders;
+                    }
+                    sort($block_folders);
+                    $last_folder = $block_folders[count($block_folders) - 1];
+                    $partition_path = "$last_folder/" . self::PARTITION_PREFIX;
+                    $partition_folders = glob("$partition_path*");
+                    sort($partition_folders);
+                    $last_index_file = $partition_folders[
+                        count($partition_folders) - 1];
+                    $parameters['SAVE_PARTITION'] = intval(
+                        substr($last_index_file, strlen($partition_path)));
+                }
             }
             return $parameters;
         } else {
diff --git a/src/library/StochasticTermSegmenter.php b/src/library/StochasticTermSegmenter.php
index 73137e923..364858fe9 100644
--- a/src/library/StochasticTermSegmenter.php
+++ b/src/library/StochasticTermSegmenter.php
@@ -545,4 +545,4 @@ class StochasticTermSegmenter
         // Set end of term marker
         $sub_trie['$'] = $frequency;
     }
-}
\ No newline at end of file
+}
diff --git a/src/models/GroupModel.php b/src/models/GroupModel.php
index 5b3762f61..54f2460c0 100644
--- a/src/models/GroupModel.php
+++ b/src/models/GroupModel.php
@@ -4537,9 +4537,10 @@ EOD;
             if (is_writable($pre_resource)) {
                 $resource['is_writable'] = true;
             }
-            if (in_array($name . ".jpg", $thumbs)) {
+            if (in_array($name . ".jpg", $thumbs) ||
+                in_array($name . ".webp", $thumbs)) {
                 $resource['has_thumb'] = true;
-                if (in_array($name.".gif", $thumbs)) {
+                if (in_array($name . ".gif", $thumbs)) {
                     $resource['has_animated_thumb'] = true;
                 }
             } else if ($thumb_folder && !$resource['is_dir'] &&
diff --git a/tests/IndexDocumentBundleTest.php b/tests/IndexDocumentBundleTest.php
index 5716120d4..c1eedca6b 100644
--- a/tests/IndexDocumentBundleTest.php
+++ b/tests/IndexDocumentBundleTest.php
@@ -320,6 +320,7 @@ use seekquarry\yioop\library\UnitTest;
         }
         $this->assertEqual($sum + count($active_postings), $num_docs,
             "Term 'be' occurs in correct number of documents");
+        $key_len = $posting_tools->key_len;
         for ($i = 0; $i < 2; $i++) {
             $partition = $term_row[$i]['PARTITION'];
             $partition_folder = $this->index_archive->getPartitionBaseFolder(
ViewGit