Fix some notices, add more documentation, a=chris

Chris Pollett [2021-09-19 01:Sep:th]
Fix some notices, add more documentation, a=chris
Filename
src/library/BPlusTree.php
src/library/IndexDocumentBundle.php
src/library/LinearHashTable.php
src/library/PartitionDocumentBundle.php
src/models/GroupModel.php
src/models/ParallelModel.php
diff --git a/src/library/BPlusTree.php b/src/library/BPlusTree.php
index 616d96eb3..fbdc38adc 100644
--- a/src/library/BPlusTree.php
+++ b/src/library/BPlusTree.php
@@ -52,7 +52,7 @@ class BPlusTree
      */
     const MAX_CACHE_SIZE = 200;
     /**
-     *
+     * Default parameters to use when constructing a BPlusTree
      */
     const DEFAULT_PARAMETERS = ["COMPRESSOR" => self::DEFAULT_COMPRESSOR,
         "FORMAT" => ["PRIMARY KEY" => ["KEY", -1], "VALUE" => "BLOB"],
@@ -85,7 +85,8 @@ class BPlusTree
      */
     const NODE_PREFIX = "i";
     /**
-     *
+     * File name of file used to store the parameters of this
+     * BPlusTree
      */
     const PARAMETERS_FILE = "bpt_parameters.txt";
     /**
@@ -97,11 +98,15 @@ class BPlusTree
      */
     public $get_archive_cache = [null, "", -1];
     /**
-     *
+     * Array of column names for the columns in a BPlusTree which
+     * are of type BLOB or SERIAL
+     * @var array
      */
     public $blob_columns;
     /**
-     *
+     * The seekquarry\yioop\library\compressors\Compressor object used to
+     * compress record files and blob items.
+     * @var object
      */
     public $compressor;
     /**
@@ -118,7 +123,8 @@ class BPlusTree
      */
     public $instance_time;
     /**
-     *
+     * Name of primary key column for records
+     * @var string
      */
     public $key_field;
     /**
@@ -127,11 +133,15 @@ class BPlusTree
      */
     public $parameters;
     /**
-     *
+     * Array of column names for the columns in a BPlusTree which
+     * are of type SERIAL
+     * @var array
      */
     public $serial_columns;
     /**
-     *
+     * The PackedTableTools object used to pack and unpack records in
+     * BPlusTree
+     * @var object
      */
     public $table_tools;
     /**
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 307733e17..a6c186845 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -107,22 +107,28 @@ class IndexDocumentBundle implements CrawlConstants
      */
     const LAST_ENTRIES_FILENAME = "last_entries";
     /**
-     *
+     * The filename of a file that is used to keep track of the integer that
+     * says what is the next partition with documents that can be added to
+     * this IndexDocumentBundle's dictionary. I.e., it should be that
+     * next_partition <= save_partition
      */
     const NEXT_PARTITION_FILE = "next_partition.txt";
     /**
-     *
+     * Names for the files which appear within a partition sub-folder
      */
     const PARTITION_FILENAMES = [self::DOC_MAP_FILENAME,
         self::LAST_ENTRIES_FILENAME, self::POSITIONS_FILENAME,
         self::POSTINGS_FILENAME];
     /**
-     * Name of the file within a paritions positions_doc_maps folder used
-     * to contain the partitions position list for each term.
+     * Name of the file within a partition's positions_doc_maps folder used
+     * to contain the partition's position list for all terms in partition.
      */
     const POSITIONS_FILENAME = "positions";
     /**
-     *
+     * Name of the file within a partition's positions_doc_maps folder with
+     * posting information for all terms in that partition. This consists of
+     * key value pairs term_id => posting records for all documents with that
+     * term.
      */
     const POSTINGS_FILENAME = "postings";
     /**
@@ -258,6 +264,9 @@ class IndexDocumentBundle implements CrawlConstants
         return $success;
     }
     /**
+     * For every partition between next partition and save partition, adds
+     * the posting list information to the dictionary BPlusTree. At the
+     * end of this process next partition and save partition should be the same
      *
      * @param string $taking_too_long_touch a filename of a file to touch
      *  so its last modified time becomes the current time. In a typical
@@ -385,7 +394,12 @@ class IndexDocumentBundle implements CrawlConstants
         }
     }
     /**
+     * Gets the file path corresponding to the partition with index $partition
      *
+     * @param int $partition desired partition index
+     * @return string file path to where this partition's index data is stored
+     *  (Not the original documents which are stored in the
+     *  PartitionDocumentBundle)
      */
     public function getPartitionBaseFolder($partition)
     {
@@ -394,19 +408,29 @@ class IndexDocumentBundle implements CrawlConstants
         return $base_folder;
     }
     /**
-     * @return array desired summary
+     * Given the $doc_id of a document and a $partition to look for it in
+     * returns the document summary info if present and [] otherwise.
+     *
+     * @param string $doc_id of document to look up
+     * @param int $partition to look for document in
+     * @return array desired summary or [] if look up failed
      */
-    public function getSummary($doc_key, $partition)
+    public function getSummary($doc_id, $partition)
     {
-        $row = $this->documents->get($doc_key, $partition, [self::SUMMARY]);
+        $row = $this->documents->get($doc_id, $partition, [self::SUMMARY]);
         return $row[self::SUMMARY] ?? [];
     }
     /**
-     * @return array desired page cache
+     * Given the $doc_id of a document and a $partition to look for it in
+     * returns the cached page of the document if present and [] otherwise
+     *
+     * @param string $doc_id of document to look up
+     * @param int $partition to look for document in
+     * @return array desired page cache or [] if look up failed
      */
-    public function getCachePage($doc_key, $partition)
+    public function getCachePage($doc_id, $partition)
     {
-        $row = $this->documents->get($doc_key, $partition, [self::PAGE]);
+        $row = $this->documents->get($doc_id, $partition, [self::PAGE]);
         return $row[self::PAGE] ?? [];
     }
     /**
@@ -678,7 +702,25 @@ class IndexDocumentBundle implements CrawlConstants
         return $doc_id;
     }
     /**
+     * Used to add a doc_id => doc_record to the current partition's
+     * document map ($doc_map). A doc record records the number of words
+     * in the document, an overall length of the document, the length of its
+     * title, scores for each of the sentences included into the summary
+     * for the documents, and classifier scores for each classifier that was
+     * used by the crawl.
      *
+     * @param array& $doc_map associative array of docid=>doc_record pairs
+     *  that this method will modify
+     * @param string $doc_id new document id to add a record for
+     * @param int $num_words number of terms in the document associated with the
+     *  doc-id
+     * @param float $score overall score for the importance of this document
+     * @param int $title_length length of the title portion of the document
+     *  summary in terms
+     * @param array $description_scores pairs of the form (length of summary
+     *  portion, score for that portion)
+     * @param array $user_ranks for each user defined classifier for this crawl
+     *  the float score of the classifier on this document
      */
     public function addScoresDocMap(&$doc_map, $doc_id, $num_words, $score,
         $title_length, $description_scores, $user_ranks)
@@ -698,7 +740,30 @@ class IndexDocumentBundle implements CrawlConstants
         $doc_map_tools->add($doc_map, $doc_id, $entry);
     }
     /**
+     * Adds posting records associated to a document to the posting lists for
+     * a partition.
      *
+     * @param array& $postings associative array $term_id => posting list
+     *  records for that term in the partition.
+     * @param string $positions a string consisting of a concatenated sequence
+     *  of term position information for each document in turn and within this
+     *  for each term in that document.
+     * @param int $position_offset number of header bytes that might be used
+     *  before including any position data in the file that positions will
+     *  eventually be stored.
+     * @param int $doc_length length of document in terms for the document
+     *  for which we are adding posting data.
+     * @param array $word_lists term => positions within current document of
+     *  that term for the document whose posting data we are adding
+     * @param array $meta_ids meta terms associated with the document we are
+     *  adding. An example meta term might be "media:news"
+     * @param int $doc_map_index which document within the partition is the one
+     *  we are adding. I.e., 5 would mean there were 5 earlier documents whose
+     *  postings we have already added.
+     * @param array& $last_entries used to keep track of the previous values
+     *  of posting quantities so difference lists can be computed. For example,
+     *  previous $doc_map_index, previous position list offset. It also tracks
+     *  the total number of occurrences of a term within a partition.
      */
     public function addTermPostingLists(&$postings, &$positions,
         $position_offset, $doc_length, $word_lists, $meta_ids, $doc_map_index,
@@ -747,6 +812,7 @@ class IndexDocumentBundle implements CrawlConstants
     }
     /**
      * Checks if a doc_id correspons to a document or a link
+     *
      * @param string $key
      * @return bool true if a document
      */
@@ -755,9 +821,20 @@ class IndexDocumentBundle implements CrawlConstants
         return $key[self::DOCID_PART_LEN << 1] == 'd';
     }
     /**
+     * As a pre-step to calculating the inverted index information for a
+     * partition, this method groups documents and links to documents into
+     * single objects. It also does simple deduplication of documents that have
+     * the same hash. It then returns an array of the grouped document data.
      *
-     * @param int $partition
-     * @param array $test_index
+     * @param int $partition index of partition to do deduplication for
+     *  in the case that test index is empty
+     * @param array $test_index is non-empty only when doing testing of what
+     *  this method does. In which case, it should consist of an array
+     *  of $doc_id => string representing a possible record for that doc.
+     *  As deduplication is done entirely based on components of the doc_id
+     *  (hash_url, doc_type, hash_doc, hash_host) the string doesn't matter
+     *  too much.
+     * @return array groups doc_id => records associated with that doc_id
      */
     public function prepareIndexMap($partition, $test_index = [])
     {
diff --git a/src/library/LinearHashTable.php b/src/library/LinearHashTable.php
index 80d309629..d3ff440c8 100644
--- a/src/library/LinearHashTable.php
+++ b/src/library/LinearHashTable.php
@@ -48,11 +48,11 @@ class LinearHashTable
      */
     const ACTIVE_INDEX = "active";
     /**
-     *
+     * Compression strategy used to compress blob and serial columns
      */
     const DEFAULT_COMPRESSOR = C\NS_COMPRESSORS . "NonCompressor";
     /**
-     *
+     * Default parameters to use when constructing a LinearHashTable
      */
     const DEFAULT_PARAMETERS = ["COMPRESSOR" => self::DEFAULT_COMPRESSOR,
         "COUNT" => 0, "PARTITION_SIZE_THRESHOLD" =>
@@ -74,19 +74,22 @@ class LinearHashTable
      */
     const KEY_PREFIX = "key_";
     /**
-     *
+     * Default maximum number of records to store in a hash table partition
      */
     const MAX_ITEMS_PER_FILE = 16384;
     /**
-     *
+     * File name of file used to store the parameters of this
+     * LinearHashTable
      */
     const PARAMETERS_FILE = "lht_parameters.txt";
     /**
-     *
+     * Prefix to file names of LinearHashTable partition files
      */
     const PARTITION_PREFIX = "partition_";
     /**
-     *
+     * Maximum number of bytes a partition can have before a split.
+     * Notice this implies a maximum file size to store
+     * in BLOB columns
      */
     const PARTITION_SIZE_THRESHOLD = 2147483648;
     /**
@@ -110,15 +113,20 @@ class LinearHashTable
      */
     public $get_archive_cache = [null, "", -1];
     /**
-     *
+     * Array of column names for the columns in a LinearHashTable which
+     * are of type BLOB or SERIAL
+     * @var array
      */
     public $blob_columns;
     /**
-     *
+     * The seekquarry\yioop\library\compressors\Compressor object used to
+     * compress record files and blob items.
+     * @var object
      */
     public $compressor;
     /**
-     *
+     * Folder path where the LinearHashTable is stored
+     * @var string
      */
     public $folder;
     /**
@@ -130,19 +138,26 @@ class LinearHashTable
      */
     public $instance_time;
     /**
-     *
+     * Name of primary key column for records
+     * @var string
      */
     public $key_field;
     /**
-     *
+     * Stores the constructor parameters used to create this
+     * LinearHashTable
+     * @var array
      */
     public $parameters;
     /**
-     *
+     * Array of column names for the columns in a LinearHashTable which
+     * are of type SERIAL
+     * @var array
      */
     public $serial_columns;
     /**
-     *
+     * The PackedTableTools object used to pack and unpack records in
+     * LinearHashTable
+     * @var object
      */
     public $table_tools;
     /**
diff --git a/src/library/PartitionDocumentBundle.php b/src/library/PartitionDocumentBundle.php
index 0a3b1200e..33b6d7d05 100644
--- a/src/library/PartitionDocumentBundle.php
+++ b/src/library/PartitionDocumentBundle.php
@@ -94,7 +94,7 @@ class PartitionDocumentBundle
      */
     const PARTITION_PREFIX = "partition_";
     /**
-     * Maximum number of bytes a partition cna have before the next partition
+     * Maximum number of bytes a partition can have before the next partition
      * is started. Notice this implies a maximum file size to store
      * in BLOB columns
      */
@@ -152,7 +152,7 @@ class PartitionDocumentBundle
      */
     public $key_field;
     /**
-     * Stores the constructor paramters used to create this
+     * Stores the constructor parameters used to create this
      * PartitionDocumentBundle
      * @var array
      */
diff --git a/src/models/GroupModel.php b/src/models/GroupModel.php
index 8a33712ea..334c74439 100644
--- a/src/models/GroupModel.php
+++ b/src/models/GroupModel.php
@@ -2455,7 +2455,7 @@ EOD;
                     list(, $eval_data[$k][$m]) = $this->evaluateCell(
                         $data[$i][$j], 1, $eval_data);
                 } else {
-                    $eval_data[$k][$m] = $data[$i][$j];
+                    $eval_data[$k][$m] = $data[$i][$j] ?? 0;
                 }
                 if ($i >= $rectangle[0][0] && $j >= $rectangle[0][1]) {
                     $out_data[$k][$m] = $eval_data[$k][$m];
diff --git a/src/models/ParallelModel.php b/src/models/ParallelModel.php
index 046c56dd3..2b72f4607 100755
--- a/src/models/ParallelModel.php
+++ b/src/models/ParallelModel.php
@@ -168,7 +168,7 @@ class ParallelModel extends Model
             } else {
                 foreach ($lookup_info as $lookup_item) {
                     $out_lookup_info = [];
-                    if (count($lookup_item) == 5) {
+                    if (is_array($lookup_item) && count($lookup_item) == 5) {
                         list($index, , , , ) = $lookup_item;
                         $machines[$index] = $machine_urls[$index];
                     } else {
ViewGit