diff --git a/src/library/BPlusTree.php b/src/library/BPlusTree.php
index 616d96eb3..fbdc38adc 100644
--- a/src/library/BPlusTree.php
+++ b/src/library/BPlusTree.php
@@ -52,7 +52,7 @@ class BPlusTree
*/
const MAX_CACHE_SIZE = 200;
/**
- *
+ * Default parameters to use when constructing a BPlusTree
*/
const DEFAULT_PARAMETERS = ["COMPRESSOR" => self::DEFAULT_COMPRESSOR,
"FORMAT" => ["PRIMARY KEY" => ["KEY", -1], "VALUE" => "BLOB"],
@@ -85,7 +85,8 @@ class BPlusTree
*/
const NODE_PREFIX = "i";
/**
- *
+ * File name of file used to store the parameters of this
+ * BPlusTree
*/
const PARAMETERS_FILE = "bpt_parameters.txt";
/**
@@ -97,11 +98,15 @@ class BPlusTree
*/
public $get_archive_cache = [null, "", -1];
/**
- *
+ * Array of column names for the columns in a BPlusTree which
+ * are of type BLOB or SERIAL
+ * @var array
*/
public $blob_columns;
/**
- *
+ * The seekquarry\yioop\library\compressors\Compressor object used to
+ * compress record files and blob items.
+ * @var object
*/
public $compressor;
/**
@@ -118,7 +123,8 @@ class BPlusTree
*/
public $instance_time;
/**
- *
+ * Name of primary key column for records
+ * @var string
*/
public $key_field;
/**
@@ -127,11 +133,15 @@ class BPlusTree
*/
public $parameters;
/**
- *
+ * Array of column names for the columns in a BPlusTree which
+ * are of type SERIAL
+ * @var array
*/
public $serial_columns;
/**
- *
+ * The PackedTableTools object used to pack and unpack records in
+ * BPlusTree
+ * @var object
*/
public $table_tools;
/**
diff --git a/src/library/IndexDocumentBundle.php b/src/library/IndexDocumentBundle.php
index 307733e17..a6c186845 100644
--- a/src/library/IndexDocumentBundle.php
+++ b/src/library/IndexDocumentBundle.php
@@ -107,22 +107,28 @@ class IndexDocumentBundle implements CrawlConstants
*/
const LAST_ENTRIES_FILENAME = "last_entries";
/**
- *
+ * The filename of a file that is used to keep track of the integer
+ * index of the next partition with documents that can be added to
+ * this IndexDocumentBundle's dictionary. I.e., it should be that
+ * next_partition <= save_partition
*/
const NEXT_PARTITION_FILE = "next_partition.txt";
/**
- *
+ * Names for the files which appear within a partition sub-folder
*/
const PARTITION_FILENAMES = [self::DOC_MAP_FILENAME,
self::LAST_ENTRIES_FILENAME, self::POSITIONS_FILENAME,
self::POSTINGS_FILENAME];
/**
- * Name of the file within a paritions positions_doc_maps folder used
- * to contain the partitions position list for each term.
+ * Name of the file within a partition's positions_doc_maps folder used
+ * to contain the partition's position list for all terms in partition.
*/
const POSITIONS_FILENAME = "positions";
/**
- *
+ * Name of the file within a partition's positions_doc_maps folder with
+ * posting information for all terms in that partition. This consists of
+ * key value pairs term_id => posting records for all documents with that
+ * term.
*/
const POSTINGS_FILENAME = "postings";
/**
@@ -258,6 +264,9 @@ class IndexDocumentBundle implements CrawlConstants
return $success;
}
/**
+ * For every partition between next partition and save partition, adds
+ * the posting list information to the dictionary BPlusTree. At the
+ * end of this process next partition and save partition should be the same
*
* @param string $taking_too_long_touch a filename of a file to touch
* so its last modified time becomes the current time. In a typical
@@ -385,7 +394,12 @@ class IndexDocumentBundle implements CrawlConstants
}
}
/**
+ * Gets the file path corresponding to the partition with index $partition
*
+ * @param int $partition desired partition index
+ * @return string file path to where this partition's index data is stored
+ * (Not the original documents which are stored in the
+ * PartitionDocumentBundle)
*/
public function getPartitionBaseFolder($partition)
{
@@ -394,19 +408,29 @@ class IndexDocumentBundle implements CrawlConstants
return $base_folder;
}
/**
- * @return array desired summary
+ * Given the $doc_id of a document and a $partition to look for it in
+ * returns the document summary info if present and [] otherwise.
+ *
+ * @param string $doc_id of document to look up
+ * @param int $partition to look for document in
+ * @return array desired summary or [] if look up failed
*/
- public function getSummary($doc_key, $partition)
+ public function getSummary($doc_id, $partition)
{
- $row = $this->documents->get($doc_key, $partition, [self::SUMMARY]);
+ $row = $this->documents->get($doc_id, $partition, [self::SUMMARY]);
return $row[self::SUMMARY] ?? [];
}
/**
- * @return array desired page cache
+ * Given the $doc_id of a document and a $partition to look for it in
+ * returns the cached page of the document if present and [] otherwise
+ *
+ * @param string $doc_id of document to look up
+ * @param int $partition to look for document in
+ * @return array desired page cache or [] if look up failed
*/
- public function getCachePage($doc_key, $partition)
+ public function getCachePage($doc_id, $partition)
{
- $row = $this->documents->get($doc_key, $partition, [self::PAGE]);
+ $row = $this->documents->get($doc_id, $partition, [self::PAGE]);
return $row[self::PAGE] ?? [];
}
/**
@@ -678,7 +702,25 @@ class IndexDocumentBundle implements CrawlConstants
return $doc_id;
}
/**
+ * Used to add a doc_id => doc_record pair to the current partition's
+ * document map ($doc_map). A doc record records the number of words
+ * in the document, an overall length of the document, the length of its
+ * title, scores for each of the sentences included into the summary
+ * for the documents, and classifier scores for each classifier that was
+ * used by the crawl.
*
+ * @param array& $doc_map associative array of docid=>doc_record pairs
+ * that this method will modify
+ * @param string $doc_id new document id to add a record for
+ * @param int $num_words number of terms in the document associated with the
+ * doc-id
+ * @param float $score overall score for the importance of this document
+ * @param int $title_length length of the title portion of the document
+ * summary in terms
+ * @param array $description_scores pairs of the form (length of summary
+ * portion, score for that portion)
+ * @param array $user_ranks for each user defined classifier for this crawl
+ * the float score of the classifier on this document
*/
public function addScoresDocMap(&$doc_map, $doc_id, $num_words, $score,
$title_length, $description_scores, $user_ranks)
@@ -698,7 +740,30 @@ class IndexDocumentBundle implements CrawlConstants
$doc_map_tools->add($doc_map, $doc_id, $entry);
}
/**
+ * Adds posting records associated to a document to the posting lists for
+ * a partition.
*
+ * @param array& $postings associative array $term_id => posting list
+ * records for that term in the partition.
+ * @param string& $positions a string consisting of a concatenated sequence
+ * of term position information for each document in turn, and within
+ * this, for each term in that document.
+ * @param int $position_offset number of header bytes that might be used
+ * before including any position data in the file that positions will
+ * eventually be stored.
+ * @param int $doc_length length of document in terms for the document
+ * for which we are adding posting data.
+ * @param array $word_lists term => positions within current document of
+ * that term for the document whose posting data we are adding
+ * @param array $meta_ids meta terms associated with the document we are
+ * adding. An example, meta term might be "media:news"
+ * @param int $doc_map_index which document within the partition is the one
+ * we are adding. I.e., 5 would mean there were 5 earlier documents whose
+ * postings we have already added.
+ * @param array& $last_entries used to keep track of the previous values
+ * of posting quantities so difference lists can be computed. For example,
+ * previous $doc_map_index, previous position list offset. It also tracks
+ * the total number of occurrences of a term within a partition.
*/
public function addTermPostingLists(&$postings, &$positions,
$position_offset, $doc_length, $word_lists, $meta_ids, $doc_map_index,
@@ -747,6 +812,7 @@ class IndexDocumentBundle implements CrawlConstants
}
/**
* Checks if a doc_id correspons to a document or a link
+ *
* @param string $key
* @return bool true if a document
*/
@@ -755,9 +821,20 @@ class IndexDocumentBundle implements CrawlConstants
return $key[self::DOCID_PART_LEN << 1] == 'd';
}
/**
+ * As a pre-step to calculating the inverted index information for a
+ * partition, this method groups documents and links to documents into
+ * single objects. It also does simple deduplication of documents that have
+ * the same hash. It then returns an array of the grouped document data.
*
- * @param int $partition
- * @param array $test_index
+ * @param int $partition index of partition to do deduplication for
+ * in the case that test index is empty
+ * @param array $test_index is non-empty only when doing testing of what
+ * this method does. In which case, it should consist of an array
+ * of $doc_id => string representing a possible record for that doc.
+ * As deduplication is done entirely based on components of the doc_id
+ * (hash_url, doc_type, hash_doc, hash_host) the string doesn't matter
+ * too much.
+ * @return array groups doc_id => records associated with that doc_id
*/
public function prepareIndexMap($partition, $test_index = [])
{
diff --git a/src/library/LinearHashTable.php b/src/library/LinearHashTable.php
index 80d309629..d3ff440c8 100644
--- a/src/library/LinearHashTable.php
+++ b/src/library/LinearHashTable.php
@@ -48,11 +48,11 @@ class LinearHashTable
*/
const ACTIVE_INDEX = "active";
/**
- *
+ * Compression strategy used to compress blob and serial columns
*/
const DEFAULT_COMPRESSOR = C\NS_COMPRESSORS . "NonCompressor";
/**
- *
+ * Default parameters to use when constructing a LinearHashTable
*/
const DEFAULT_PARAMETERS = ["COMPRESSOR" => self::DEFAULT_COMPRESSOR,
"COUNT" => 0, "PARTITION_SIZE_THRESHOLD" =>
@@ -74,19 +74,22 @@ class LinearHashTable
*/
const KEY_PREFIX = "key_";
/**
- *
+ * Default maximum number of records to store in a hash table partition
*/
const MAX_ITEMS_PER_FILE = 16384;
/**
- *
+ * File name of file used to store the parameters of this
+ * LinearHashTable
*/
const PARAMETERS_FILE = "lht_parameters.txt";
/**
- *
+ * Prefix to file names of LinearHashTable partition files
*/
const PARTITION_PREFIX = "partition_";
/**
- *
+ * Maximum number of bytes a partition can have before a split.
+ * Notice this implies a maximum file size to store
+ * in BLOB columns
*/
const PARTITION_SIZE_THRESHOLD = 2147483648;
/**
@@ -110,15 +113,20 @@ class LinearHashTable
*/
public $get_archive_cache = [null, "", -1];
/**
- *
+ * Array of column names for the columns in a LinearHashTable which
+ * are of type BLOB or SERIAL
+ * @var array
*/
public $blob_columns;
/**
- *
+ * The seekquarry\yioop\library\compressors\Compressor object used to
+ * compress record files and blob items.
+ * @var object
*/
public $compressor;
/**
- *
+ * Folder path where the LinearHashTable is stored
+ * @var string
*/
public $folder;
/**
@@ -130,19 +138,26 @@ class LinearHashTable
*/
public $instance_time;
/**
- *
+ * Name of primary key column for records
+ * @var string
*/
public $key_field;
/**
- *
+ * Stores the constructor parameters used to create this
+ * LinearHashTable
+ * @var array
*/
public $parameters;
/**
- *
+ * Array of column names for the columns in a LinearHashTable which
+ * are of type SERIAL
+ * @var array
*/
public $serial_columns;
/**
- *
+ * The PackedTableTools object used to pack and unpack records in
+ * LinearHashTable
+ * @var object
*/
public $table_tools;
/**
diff --git a/src/library/PartitionDocumentBundle.php b/src/library/PartitionDocumentBundle.php
index 0a3b1200e..33b6d7d05 100644
--- a/src/library/PartitionDocumentBundle.php
+++ b/src/library/PartitionDocumentBundle.php
@@ -94,7 +94,7 @@ class PartitionDocumentBundle
*/
const PARTITION_PREFIX = "partition_";
/**
- * Maximum number of bytes a partition cna have before the next partition
+ * Maximum number of bytes a partition can have before the next partition
* is started. Notice this implies a maximum file size to store
* in BLOB columns
*/
@@ -152,7 +152,7 @@ class PartitionDocumentBundle
*/
public $key_field;
/**
- * Stores the constructor paramters used to create this
+ * Stores the constructor parameters used to create this
* PartitionDocumentBundle
* @var array
*/
diff --git a/src/models/GroupModel.php b/src/models/GroupModel.php
index 8a33712ea..334c74439 100644
--- a/src/models/GroupModel.php
+++ b/src/models/GroupModel.php
@@ -2455,7 +2455,7 @@ EOD;
list(, $eval_data[$k][$m]) = $this->evaluateCell(
$data[$i][$j], 1, $eval_data);
} else {
- $eval_data[$k][$m] = $data[$i][$j];
+ $eval_data[$k][$m] = $data[$i][$j] ?? 0;
}
if ($i >= $rectangle[0][0] && $j >= $rectangle[0][1]) {
$out_data[$k][$m] = $eval_data[$k][$m];
diff --git a/src/models/ParallelModel.php b/src/models/ParallelModel.php
index 046c56dd3..2b72f4607 100755
--- a/src/models/ParallelModel.php
+++ b/src/models/ParallelModel.php
@@ -168,7 +168,7 @@ class ParallelModel extends Model
} else {
foreach ($lookup_info as $lookup_item) {
$out_lookup_info = [];
- if (count($lookup_item) == 5) {
+ if (is_array($lookup_item) && count($lookup_item) == 5) {
list($index, , , , ) = $lookup_item;
$machines[$index] = $machine_urls[$index];
} else {