Last commit for src/library/PackedTableTools.php: 88ba842636f692ac9bde972fed5a3cf6959d841b

Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle

Chris Pollett [2024-02-04 02:Feb:th]
Allows Arctool to rebuild/remerge a range of partitions, fixes term lookup bugs in WordIterator and IndexDocumentBundle
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2021  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2021
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;

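/**
 * Loads crawlLog and the other general utility functions if needed (the
 * class below calls vByteEncode, vByteDecode, encode255, and decode255,
 * none of which are defined in this file)
 */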
require_once __DIR__ . "/Utility.php";
/**
 * A collection of methods to encode and decode records according to
 * a signature.
 *
 * @author Chris Pollett
 */
class PackedTableTools
{
    /**
     * A string compression algorithm used to compress rows represented as
     * strings
     * @var seekquarry\yioop\library\compressor\Compressor
     */
    public $compressor;
    /**
     * This is the signature of the records this PackedTableTools will
     * manipulate. After construction it is an associative array of
     * column_name => column_type pairs where each column_type is one of the
     * canonical types BOOL, DOUBLE, INT, REAL, TEXT (the values of
     * PackedTableTools::TYPE_SYNONYMS). The "PRIMARY KEY" entry of the
     * format passed to the constructor is not stored here, but rather in
     * $key_field and $key_len.
     * @var array
     */
    public $format;
    /**
     * Name of the column used for the primary key.
     * @var string
     */
    public $key_field;
    /**
     * Fixed number of bytes used to store the primary key. If this value
     * is not positive, keys are treated as variable length and are stored
     * prefixed with a single byte giving their length.
     * @var int
     */
    public $key_len;
    /**
     * Number of columns in a record that are of type BOOL
     * @var int
     */
    public $num_bool_columns;
    /**
     * Number of columns in a record that are of type INT
     * @var int
     */
    public $num_int_columns;
    /**
     * Number of columns in a record that are of type TEXT
     * @var int
     */
    public $num_text_columns;
    /**
     * If not specified when constructing the instance, then this
     * will be the seekquarry\yioop\library\compressor\Compressor
     * used to compress rows.
     */
    const DEFAULT_COMPRESSOR = C\NS_COMPRESSORS . "NonCompressor";
    /**
     * If not specified by the format then this will be the assumed fixed
     * length of the primary key
     */
    const DEFAULT_KEY_LEN = 16;
    /**
     * Array of synonyms for the different possible column name types.
     * Keys are the accepted (upper case) names, values are the canonical
     * type each name maps to.
     */
    const TYPE_SYNONYMS = [ "BOOLEAN" => "BOOL", "BOOL" => "BOOL",
            "CLOB" => "TEXT", "DOUBLE" => "DOUBLE", "FLOAT" => "REAL",
            "INT" => "INT", "INTEGER" => "INT", "REAL"=> "REAL",
            "TEXT" => "TEXT"
        ];
    /**
     * Constant used to indicate that add() should be adding row to an
     * in memory table
     */
    const ADD_MEM_TABLE = 0;
    /**
     * Constant used to indicate that add() should be adding row using
     * a file handle
     */
    const ADD_FILE_HANDLE = 1;
    /**
     * Constant used to indicate that add() should append the row to
     * a specified file (not having a file handle to it yet)
     */
    const ADD_FILE_PATH = 2;
    /**
     * Constant used to indicate that add() should replace the value if
     * there is already a row in the table with the same primary key
     */
    const REPLACE_MODE = 0;
    /**
     * Constant used to indicate that add() should cluster the value if
     * there is already a row in the table with the same primary key
     */
    const APPEND_MODE = 1;
    /**
     * Used to create an instance of a PackedTableTools according to the
     * $format for record columns and $compressor_type to be used for row
     * compression.
     *
     * If $format contains a column type that is not a key of
     * PackedTableTools::TYPE_SYNONYMS, processing of $format stops at that
     * column, so later columns are not added to the signature.
     *
     * @param array $format associative array of items in one of the forms:
     * "PRIMARY KEY" => column_name, "PRIMARY KEY" =>
     * [column_name, length_of_primary_key],  column_name => column_type pairs
     * column_type's are from among BOOL, DOUBLE, INT, REAL, TEXT or their
     * synonyms as given in PackedTableTools::TYPE_SYNONYMS.
     * @param string $compressor_type name of the class, instantiated with
     *  no arguments, used to compress records
     */
    public function __construct($format, $compressor_type =
        self::DEFAULT_COMPRESSOR)
    {
        $this->compressor = new $compressor_type();
        $this->format = [];
        $type_synonyms = self::TYPE_SYNONYMS;
        $this->key_field = null;
        $this->key_len = self::DEFAULT_KEY_LEN;
        $this->num_bool_columns = 0;
        $this->num_int_columns = 0;
        $this->num_text_columns = 0;
        foreach ($format as $field_name => $type) {
            if (strtoupper($field_name) == "PRIMARY KEY") {
                if (is_array($type)) {
                    $this->key_field = $type[0];
                    $this->key_len = $type[1] ?? self::DEFAULT_KEY_LEN;
                } else {
                    $this->key_field = $type;
                }
                continue;
            }
            $type = strtoupper($type);
            if (empty($type_synonyms[$type])) {
                return;
            }
            /* counts must be based on the canonical type, otherwise
               columns declared with a synonym (BOOLEAN, INTEGER, CLOB)
               would not be counted and unpack() would compute the wrong
               header lengths
             */
            $canonical_type = $type_synonyms[$type];
            $this->format[$field_name] = $canonical_type;
            if ($canonical_type == "BOOL") {
                $this->num_bool_columns++;
            } else if ($canonical_type == "INT") {
                $this->num_int_columns++;
            } else if ($canonical_type == "TEXT") {
                $this->num_text_columns++;
            }
        }
    }
    /**
     * Adds ($key, $table_row) as an entry into $table
     * using the adding and replace methods specified
     *
     * For the two file based add methods the entry is always appended to
     * the file and $mode is not consulted; rows sharing a key are
     * combined according to the mode given when the file is later read
     * with load().
     *
     * NOTE(review): in the file based add methods each entry is compressed
     * on its own before being appended, whereas load() uncompresses the
     * whole file in one call. This presumably relies on the compressor's
     * output being concatenable -- confirm before using these methods with
     * a compressor for which that is not the case.
     *
     * @param mixed &$table either an associative array of key => value
     *  pairs where the keys are strings of length matching this
     *  PackedTableTools signature and the values have been packed according to
     *  this signature, or the file name of a file containing a serialized
     *  version of such a table, or a file handle to the end of such a file
     * @param string $key a key string of length given by the signature of this
     *      PackedTableTools
     * @param string $table_row a record packed according to the signature
     *      of this PackedTableTools
     * @param int $add_method one of self::ADD_MEM_TABLE, self::ADD_FILE_HANDLE,
     *      self::ADD_FILE_PATH indicating which of the three possibilities
     *      for $table we have
     * @param int $mode either self::APPEND_MODE or self::REPLACE_MODE
     *      If the former the table_row data will be appended to any data
     *      currently associated with the key, if the latter it will replace
     *      such data
     * @return bool whether or not adding was successful
     */
    public function add(&$table, $key, $table_row,
        $add_method = self::ADD_MEM_TABLE, $mode = self::REPLACE_MODE)
    {
        // variable length keys are stored prefixed by a one byte length
        $encode_key = ($this->key_len > 0) ? $key :
            chr(strlen($key)) . $key;
        switch ($add_method)
        {
            case self::ADD_FILE_PATH:
                /* filesize() results are cached by PHP, so clear the
                   cache to see the effect of earlier appends to the file
                 */
                clearstatcache(true, $table);
                $has_rows = file_exists($table) && filesize($table) > 0;
                $seperator = ($has_rows) ? "\xFF" : "";
                $out = $seperator . encode255($encode_key . $table_row);
                $out = $this->compressor->compress($out);
                return (file_put_contents($table, $out , FILE_APPEND) > 0);
            case self::ADD_FILE_HANDLE:
                $seperator = (ftell($table) > 0) ? "\xFF" : "";
                $out = $seperator . encode255($encode_key . $table_row);
                $out = $this->compressor->compress($out);
                return (fwrite($table, $out) > 0);
            case self::ADD_MEM_TABLE:
            default:
                if ($mode == self::REPLACE_MODE || empty($table[$key])) {
                    $table[$key] = $table_row;
                } else if ($mode == self::APPEND_MODE) {
                    $table[$key] = $this->mergeRowValues($table[$key],
                        $table_row);
                }
        }
        return true;
    }
    /**
     * Merges two rows of items packed according to this packed table tools
     * into a single row of items.
     *
     * @param string $row_values1 a row (less key) packed according
     *      to this packed table tools
     * @param string $row_values2 a second row (less key) packed
     *      according to this packed table tools
     * @return string a merged row consisting of the items in the first row
     *      followed by those in the second.
     */
    public function mergeRowValues($row_values1, $row_values2)
    {
        /* each row starts with a vByte encoded item count, the merged row
           starts with the sum of the counts followed by the two item
           sequences with their own counts stripped off
         */
        $row_values1_pos = 0;
        $row_values1_count = vByteDecode($row_values1,
            $row_values1_pos);
        $row_values2_pos = 0;
        $row_values2_count = vByteDecode($row_values2,
            $row_values2_pos);
        return vByteEncode($row_values1_count + $row_values2_count) .
            substr($row_values1, $row_values1_pos) .
            substr($row_values2, $row_values2_pos);
    }
    /**
     * Removes $key and any records associated with it from $table
     *
     * @param array& $table array of key => records pairs where the
     *  key is a string of length given by this PackedTableTool's signature
     *  and the records are packed according this PackedTableTool's signature
     * @param string $key to remove records for
     * @return bool success or failure of removal (false if $key was not
     *  in $table)
     */
    public function delete(&$table, $key)
    {
        /* isset rather than !empty so that a key whose stored value is
           "" or "0" can still be removed; this matches what find() treats
           as present
         */
        if (isset($table[$key])) {
            unset($table[$key]);
            return true;
        }
        return false;
    }
    /**
     * Return any records in $table associated with $key
     *
     * @param array $table array of key => records pairs where the
     *  key is a string of length given by this PackedTableTool's signature
     *  and the records are packed according this PackedTableTool's signature
     * @param string $key to return records for
     * @return mixed the value stored for $key in $table (records that have
     *  been packed according to this PackedTableTool's signature), or null
     *  if there is no such value
     */
    public function find($table, $key)
    {
        return $table[$key] ?? null;
    }
    /**
     * Reads a table previously written with save(), or built up using the
     * file based add methods of add(), into an in-memory array.
     *
     * @param string $table_path path of the file to read the table from
     * @param int $mode either self::REPLACE_MODE or self::APPEND_MODE. If
     *  the same key occurs more than once in the file, then in the former
     *  case the last row read for the key is kept, otherwise, the rows
     *  for the key are merged using mergeRowValues()
     * @return mixed associative array key => packed row of the entries
     *  in the file (empty if the file holds no entries), or null if the
     *  file does not exist or could not be read
     */
    public function load($table_path, $mode = self::REPLACE_MODE)
    {
        if (!file_exists($table_path)) {
            return null;
        }
        $pre_table = file_get_contents($table_path);
        if ($pre_table === false) {
            return null;
        }
        $table = $this->compressor->uncompress($pre_table);
        $pre_rows = explode("\xFF", (string)$table);
        $rows = [];
        $fixed_length_key = ($this->key_len > 0);
        $replace_mode = ($mode == self::REPLACE_MODE);
        $default_key_len = $this->key_len;
        // variable length keys are preceded by a one byte length
        $key_offset = $fixed_length_key ? 0 : 1;
        foreach ($pre_rows as $pre_row) {
            $entry = decode255($pre_row);
            if (!is_string($entry) || $entry === "") {
                /* explode() of an empty table yields a single empty
                   string, which is not an entry
                 */
                continue;
            }
            $key_len = ($fixed_length_key) ? $default_key_len :
                ord($entry[0]);
            $key = substr($entry, $key_offset, $key_len);
            $new_values = substr($entry, $key_len + $key_offset);
            $rows[$key] = ($replace_mode || empty($rows[$key])) ? $new_values :
                $this->mergeRowValues($rows[$key], $new_values);
        }
        return $rows;
    }
    /**
     * Packs one or more items into a string according to the signature
     * of this PackedTableTools.
     *
     * The result begins with the vByte encoded number of items. Each
     * item is then stored as:
     * (1) one bit per BOOL column, eight columns to a byte, first column
     *     in the high order bit,
     * (2) a two bit code per INT column, four columns to a byte, first
     *     column in the high order bits, where codes 0, 1, 2, 3 mean the
     *     column value takes 1, 2, 4, 8 bytes,
     * (3) one length byte per TEXT column,
     * (4) the column values themselves in signature order: INT's as
     *     big-endian magnitudes whose first byte's high order bit is set
     *     for negative values, DOUBLE's using pack code E, REAL's using
     *     pack code G, and TEXT's as their raw bytes. BOOL columns have
     *     no data in this part.
     *
     * @param array $items either a single item, that is, an associative
     *  array column_name => value with a value for each column of the
     *  signature, or an array, with a 0 index, of such items
     * @return mixed string packing of the items, or null if some item is
     *  missing a column value or has a TEXT value longer than 255 bytes
     */
    public function pack($items)
    {
        $format = $this->format;
        if (!isset($items[0])) {
            $items = [$items];
        }
        $packed_items = vByteEncode(count($items));
        foreach ($items as $item) {
            $bool_column_data = "";
            // -1 is used to mean no partially filled byte is pending
            $cur_bool_int = -1;
            $cur_bool_shift = 7;
            $int_column_types = "";
            $text_column_lengths = "";
            $shift = 6;
            $cur_int_char = 0;
            $int_occurred = false;
            $packed_data = "";
            foreach ($format as $field_name => $type) {
                if (!isset($item[$field_name])) {
                    return null;
                }
                switch ($type)
                {
                    case "BOOL":
                        $cur_bool_int = ($cur_bool_int == -1) ? 0 :
                            $cur_bool_int;
                        $bool_value = ($item[$field_name]) ? 1 : 0;
                        $cur_bool_int += ($bool_value << $cur_bool_shift);
                        $cur_bool_shift--;
                        if ($cur_bool_shift < 0) {
                            $bool_column_data .= chr($cur_bool_int);
                            $cur_bool_shift = 7;
                            $cur_bool_int = -1;
                        }
                        break;
                    case "DOUBLE":
                        $packed_data .= pack("E", $item[$field_name]);
                        break;
                    case "INT":
                        $int_occurred = true;
                        $magnitude = abs($item[$field_name]);
                        // 128 (high bit of first byte) marks negatives
                        $sign_bit = ($item[$field_name] == $magnitude) ? 0 :
                            128;
                        if ($magnitude < 128) {
                            $packed_data .= chr($magnitude + $sign_bit);
                            $cur_int_char = ($cur_int_char == -1) ? 0 :
                                $cur_int_char;
                        } else {
                            /* the ranges below leave the high order bit
                               of the packed magnitude free for the sign
                             */
                            if ($magnitude < 32768) {
                                $packed_int = pack("n", $magnitude);
                                $cur_int_add = (1 << $shift);
                            } else if ($magnitude < 2147483647) {
                                $packed_int = pack("N", $magnitude);
                                $cur_int_add = (2 << $shift);
                            } else {
                                $packed_int = pack("J", $magnitude);
                                $cur_int_add = (3 << $shift);
                            }
                            if ($sign_bit) {
                                /* unpack() reads the sign of multi-byte
                                   ints from this bit, so it must be set
                                   exactly when the value is negative
                                 */
                                $packed_int[0] =
                                    chr(ord($packed_int[0]) | $sign_bit);
                            }
                            $packed_data .= $packed_int;
                            $cur_int_char = max($cur_int_char, 0) +
                                $cur_int_add;
                        }
                        $shift -= 2;
                        if ($shift < 0) {
                            $int_column_types .= chr($cur_int_char);
                            $cur_int_char = -1;
                            $shift = 6;
                        }
                        break;
                    case "REAL":
                        $packed_data .= pack("G", $item[$field_name]);
                        break;
                    case "TEXT":
                        $len = strlen($item[$field_name]);
                        if ($len > 255) {
                            // length has to fit in a single byte
                            return null;
                        }
                        $packed_data .= $item[$field_name];
                        $text_column_lengths .= chr($len);
                        break;
                }
            }
            // flush any partially filled bool and int type bytes
            if ($cur_bool_int != -1) {
                $bool_column_data .= chr($cur_bool_int);
            }
            if ($cur_int_char != -1 && $int_occurred) {
                $int_column_types .= chr($cur_int_char);
            }
            $packed_items .= $bool_column_data .
                $int_column_types . $text_column_lengths . $packed_data;
        }
        return $packed_items;
    }
    /**
     * Writes an in-memory table to a file, replacing any previous contents
     * of the file. Entries are encoded using encode255, joined using the
     * byte 0xFF as a separator, and the result is compressed using this
     * PackedTableTools' compressor.
     *
     * @param string $table_path path of the file to write the table to
     * @param array $table associative array key => packed row to write
     * @return bool whether the file could be written (writing an empty
     *  table counts as a success)
     */
    public function save($table_path, $table)
    {
        $fixed_length_key = ($this->key_len > 0);
        $encoded_rows = [];
        foreach ($table as $key => $row) {
            // variable length keys are stored prefixed by a one byte length
            $encode_key = ($fixed_length_key) ? $key :
                chr(strlen($key)) . $key;
            $encoded_rows[] = encode255($encode_key . $row);
        }
        $out = $this->compressor->compress(implode("\xFF", $encoded_rows));
        /* file_put_contents returns the number of bytes written, which
           can legitimately be 0, and false on failure
         */
        return (file_put_contents($table_path, $out) !== false);
    }
    /**
     * Given a table_row, which might represent several items grouped because
     * they share a key, returns the total number of items stored in the row
     * @param string $table_row sequence of items packed according to this
     *      PackedTableTools signature
     * @return int number of items stored in the table row
     */
    public function count($table_row)
    {
        $current_pos = 0;
        return vByteDecode($table_row, $current_pos);
    }
    /**
     * Given a table_row, which might represent several items grouped because
     * they share a key, unpacks and returns up to $limit many items
     * beginning with the item numbered $offset
     * @param string $table_row sequence of items packed according to this
     *      PackedTableTools signature
     * @param int $offset index of item to begin with
     * @param int $limit maximum number of items to return starting at offset.
     *      A value less than or equal to 0 means no maximum.
     * @return mixed array of unpacked items, each an associative array
     *      column_name => value (empty if $offset is at least the number
     *      of items in the row), or null if $table_row ends before all
     *      the requested items could be read or otherwise does not match
     *      the signature
     */
    public function unpack($table_row, $offset = 0, $limit = -1)
    {
        $format = $this->format;
        $num_int_columns = $this->num_int_columns;
        $num_text_columns = $this->num_text_columns;
        $num_bool_columns = $this->num_bool_columns;
        $bool_info_len = intval(ceil($num_bool_columns / 8));
        $int_info_len = intval(ceil($num_int_columns / 4));
        $header_len = $bool_info_len + $int_info_len + $num_text_columns;
        $unpack_code = [1 => "n", 2 => "N", 3 => "J"];
        $row_len = strlen($table_row);
        $current_pos = 0;
        $num_items = vByteDecode($table_row, $current_pos);
        if ($offset >= $num_items) {
            return [];
        }
        $limit = ($limit <= 0) ? $num_items : $limit;
        $num_items = min($limit + $offset, $num_items);
        $items = [];
        /* items are variable length, so those before $offset still have
           to be decoded to find where the requested ones begin
         */
        for ($i = 0; $i < $num_items; $i++) {
            if ($current_pos + $header_len > $row_len) {
                return null;
            }
            $bool_info = substr($table_row, $current_pos, $bool_info_len);
            $current_pos += $bool_info_len;
            $int_info = substr($table_row, $current_pos, $int_info_len);
            $current_pos += $int_info_len;
            $text_info = substr($table_row, $current_pos, $num_text_columns);
            $current_pos += $num_text_columns;
            $item = [];
            $current_bool_pos = 0;
            $cur_bool_shift = 7;
            $current_int_pos = 0;
            $current_text_pos = 0;
            $shift = 6;
            $bools_used = 0;
            $ints_used = 0;
            foreach ($format as $field_name => $type) {
                switch ($type)
                {
                    case "BOOL":
                        if ($bools_used >= $num_bool_columns) {
                            return null;
                        }
                        $item[$field_name] =
                            ((ord($bool_info[$current_bool_pos]) &
                            (1 << $cur_bool_shift)) >> $cur_bool_shift) > 0;
                        $bools_used++;
                        $cur_bool_shift--;
                        if ($cur_bool_shift < 0) {
                            $current_bool_pos++;
                            $cur_bool_shift = 7;
                        }
                        break;
                    case "DOUBLE":
                        if ($current_pos + 8 > $row_len) {
                            return null;
                        }
                        $item[$field_name] = unpack("E", substr($table_row,
                            $current_pos, 8))[1];
                        $current_pos += 8;
                        break;
                    case "INT":
                        if ($ints_used >= $num_int_columns) {
                            return null;
                        }
                        // two bit code giving the byte length of the int
                        $int_code = (ord($int_info[$current_int_pos]) &
                            (3 << $shift)) >> $shift;
                        if (!isset($table_row[$current_pos])) {
                            return null;
                        }
                        $first_char = ord($table_row[$current_pos]);
                        $len = 1 << $int_code;
                        if ($int_code == 0) {
                            $value = $first_char;
                            if ($value > 127) {
                                $value = - ($value & 127);
                            }
                            $item[$field_name] = $value;
                        } else {
                            if ($current_pos + $len > $row_len) {
                                return null;
                            }
                            $sign = ($first_char > 127) ? -1 : 1;
                            $pre_int = substr($table_row, $current_pos, $len);
                            if ($sign < 0) {
                                // clear sign bit to leave the magnitude
                                $pre_int[0] = chr($first_char & 127);
                            }
                            $item[$field_name] = $sign * (unpack(
                                $unpack_code[$int_code], $pre_int)[1]);
                        }
                        $current_pos += $len;
                        $ints_used++;
                        $shift -= 2;
                        if ($shift < 0) {
                            $current_int_pos++;
                            $shift = 6;
                        }
                        break;
                    case "REAL":
                        if ($current_pos + 4 > $row_len) {
                            return null;
                        }
                        $item[$field_name] = unpack("G", substr($table_row,
                            $current_pos, 4))[1];
                        $current_pos += 4;
                        break;
                    case "TEXT":
                        if ($current_text_pos >= $num_text_columns) {
                            return null;
                        }
                        $text_len = ord($text_info[$current_text_pos]);
                        if ($current_pos + $text_len > $row_len) {
                            return null;
                        }
                        $item[$field_name] = substr($table_row, $current_pos,
                            $text_len);
                        $current_pos += $text_len;
                        $current_text_pos++;
                        break;
                }
            }
            if ($i >= $offset) {
                $items[] = $item;
            }
        }
        return $items;
    }
}
ViewGit