<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2021  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2021
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;

/**
 * Loads general utility functions. This class calls vByteEncode,
 * vByteDecode, encode255 and decode255, none of which are defined in this
 * file.
 */
require_once __DIR__ . "/Utility.php";

/**
 * A collection of methods to encode and decode records according to
 * a signature.
 *
 * A packed row is laid out as: a vByte-encoded item count, followed, for
 * each item, by
 *  - ceil(#BOOL columns / 8) bytes of bool bits (most significant bit first),
 *  - ceil(#INT columns / 4) bytes of 2-bit int width codes
 *    (0 => 1 byte, 1 => 2 bytes, 2 => 4 bytes, 3 => 8 bytes),
 *  - one length byte per TEXT column,
 *  - the column payloads in signature order.
 *
 * @author Chris Pollett
 */
class PackedTableTools
{
    /**
     * A string compression algorithm used to compress rows represented as
     * strings
     * @var seekquarry\yioop\library\compressor\Compressor
     */
    public $compressor;
    /**
     * This is the signature of the records this PackedTableTools will
     * manipulate, with the primary key entry removed and every column type
     * replaced by its canonical name from PackedTableTools::TYPE_SYNONYMS.
     * It is an associative array of column_name => column_type pairs where
     * column_type is one of BOOL, DOUBLE, INT, REAL, TEXT.
     * @var array
     */
    public $format;
    /**
     * Name of the column used for the primary key.
     * @var string
     */
    public $key_field;
    /**
     * Fixed number of bytes used to store the primary key. A value that is
     * not positive means keys are variable length and are stored prefixed
     * by a single length byte.
     * @var int
     */
    public $key_len;
    /**
     * Number of columns in a record that are of type BOOL
     * @var int
     */
    public $num_bool_columns;
    /**
     * Number of columns in a record that are of type INT
     * @var int
     */
    public $num_int_columns;
    /**
     * Number of columns in a record that are of type TEXT
     * @var int
     */
    public $num_text_columns;
    /**
     * If not specified when constructing the instance, then this
     * will be the seekquarry\yioop\library\compressor\Compressor
     * used to compress rows.
     */
    const DEFAULT_COMPRESSOR = C\NS_COMPRESSORS . "NonCompressor";
    /**
     * If not specified by the format then this will be the assumed fixed
     * length of the primary key
     */
    const DEFAULT_KEY_LEN = 16;
    /**
     * Array of synonyms for the different possible column name types.
     */
    const TYPE_SYNONYMS = [
        "BOOLEAN" => "BOOL",
        "BOOL" => "BOOL",
        "CLOB" => "TEXT",
        "DOUBLE" => "DOUBLE",
        "FLOAT" => "REAL",
        "INT" => "INT",
        "INTEGER" => "INT",
        "REAL" => "REAL",
        "TEXT" => "TEXT"
    ];
    /**
     * Constant used to indicate that add() should be adding a row to an
     * in memory table
     */
    const ADD_MEM_TABLE = 0;
    /**
     * Constant used to indicate that add() should be adding a row using
     * a file handle
     */
    const ADD_FILE_HANDLE = 1;
    /**
     * Constant used to indicate that add() should append a row to
     * a specified file (not having a file handle to it yet)
     */
    const ADD_FILE_PATH = 2;
    /**
     * Constant used to indicate that add() should replace the value if
     * there is already a row in the table with the same primary key
     */
    const REPLACE_MODE = 0;
    /**
     * Constant used to indicate that add() should cluster the value if
     * there is already a row in the table with the same primary key
     */
    const APPEND_MODE = 1;
    /**
     * Used to create an instance of a PackedTableTools according to the
     * $format for record columns and $compressor_type to be used for row
     * compression.
     *
     * If $format contains a column type that is not a key of
     * PackedTableTools::TYPE_SYNONYMS, processing of $format stops at that
     * column and the instance is left with only the columns seen so far.
     *
     * @param array $format associative array of items in one of the forms:
     *  "PRIMARY KEY" => column_name, "PRIMARY KEY" =>
     *  [column_name, length_of_primary_key], column_name => column_type
     *  pairs. column_type's are from among BOOL, DOUBLE, INT, REAL, TEXT
     *  or their synonyms as given in PackedTableTools::TYPE_SYNONYMS.
     *  If the primary key value is not an array the key length is
     *  PackedTableTools::DEFAULT_KEY_LEN.
     * @param string $compressor_type fully qualified class name of the
     *  seekquarry\yioop\library\compressor\Compressor used to compress
     *  records
     */
    public function __construct($format,
        $compressor_type = self::DEFAULT_COMPRESSOR)
    {
        $this->compressor = new $compressor_type();
        $this->format = [];
        $type_synonyms = self::TYPE_SYNONYMS;
        $this->key_field = null;
        $this->key_len = self::DEFAULT_KEY_LEN;
        $this->num_bool_columns = 0;
        $this->num_int_columns = 0;
        $this->num_text_columns = 0;
        foreach ($format as $field_name => $type) {
            if (strtoupper($field_name) == "PRIMARY KEY") {
                if (is_array($type)) {
                    $this->key_field = $type[0];
                    $this->key_len = $type[1];
                } else {
                    $this->key_field = $type;
                }
                continue;
            }
            $type = strtoupper($type);
            if (empty($type_synonyms[$type])) {
                return;
            }
            $column_type = $type_synonyms[$type];
            $this->format[$field_name] = $column_type;
            // Count using the canonical type so that synonyms such as
            // BOOLEAN, INTEGER and CLOB are counted too; pack() and
            // unpack() size their per-item headers from these counters.
            if ($column_type == "BOOL") {
                $this->num_bool_columns++;
            } else if ($column_type == "INT") {
                $this->num_int_columns++;
            } else if ($column_type == "TEXT") {
                $this->num_text_columns++;
            }
        }
    }
    /**
     * Adds ($key, $table_row) as an entry into $table
     * using the adding and replace methods specified
     *
     * @param mixed &$table either an associative array of key => value
     *  pairs where the keys are strings of length matching this
     *  PackedTableTools signature and the values have been packed according
     *  to this signature, or the file name of a file containing a
     *  serialized version of such a table, or a file handle to the end of
     *  such a file
     * @param string $key a key string of length given by the signature of
     *  this PackedTableTools
     * @param string $table_row a record packed according to the signature
     *  of this PackedTableTools
     * @param int $add_method one of self::ADD_MEM_TABLE,
     *  self::ADD_FILE_HANDLE, self::ADD_FILE_PATH indicating which of the
     *  three possibilities for $table we have
     * @param int $mode either self::APPEND_MODE or self::REPLACE_MODE
     *  If the former the table_row data will be appended to any data
     *  currently associated with the key, if the latter it will replace
     *  such data. Only consulted for self::ADD_MEM_TABLE; the file based
     *  methods always append a new record to the file.
     * @return bool whether or not adding was successful
     */
    public function add(&$table, $key, $table_row,
        $add_method = self::ADD_MEM_TABLE, $mode = self::REPLACE_MODE)
    {
        // variable length keys are stored prefixed by a one byte length
        $encode_key = ($this->key_len > 0) ? $key : chr(strlen($key)) . $key;
        switch ($add_method) {
            case self::ADD_FILE_PATH:
                // filesize() results are cached by PHP, so clear the cache
                // to see the effect of earlier appends to the same file
                clearstatcache(true, $table);
                $file_size = file_exists($table) ? filesize($table) : 0;
                $separator = ($file_size > 0) ? "\xFF" : "";
                $out = $separator . encode255($encode_key . $table_row);
                $out = $this->compressor->compress($out);
                return (file_put_contents($table, $out, FILE_APPEND) > 0);
            case self::ADD_FILE_HANDLE:
                $separator = (ftell($table) > 0) ? "\xFF" : "";
                $out = $separator . encode255($encode_key . $table_row);
                $out = $this->compressor->compress($out);
                return (fwrite($table, $out) > 0);
            case self::ADD_MEM_TABLE:
            default:
                if ($mode == self::REPLACE_MODE || empty($table[$key])) {
                    $table[$key] = $table_row;
                } else if ($mode == self::APPEND_MODE) {
                    $table[$key] = $this->mergeRowValues($table[$key],
                        $table_row);
                }
        }
        return true;
    }
    /**
     * Merges two rows of items packed according to this packed table tools
     * into a single row of items.
     *
     * @param string $row_values1 a row (less key) packed according
     *  to this packed table tools
     * @param string $row_values2 a second row (less key) packed
     *  according to this packed table tools
     * @return string a merged row consisting of the items in the first row
     *  followed by those in the second.
     */
    public function mergeRowValues($row_values1, $row_values2)
    {
        // after each decode the position variable is used as the offset
        // of the first byte following that row's item count
        $row_values1_pos = 0;
        $row_values1_count = vByteDecode($row_values1, $row_values1_pos);
        $row_values2_pos = 0;
        $row_values2_count = vByteDecode($row_values2, $row_values2_pos);
        return vByteEncode($row_values1_count + $row_values2_count) .
            substr($row_values1, $row_values1_pos) .
            substr($row_values2, $row_values2_pos);
    }
    /**
     * Removes $key and any records associated with it from $table
     *
     * @param array& $table array of key => records pairs where the
     *  key is a string of length given by this PackedTableTool's signature
     *  and the records are packed according this PackedTableTool's
     *  signature
     * @param string $key to remove records for
     * @return bool true if $key was present and has been removed, false
     *  otherwise
     */
    public function delete(&$table, $key)
    {
        // isset rather than !empty so that a row whose packed value is
        // falsy (for example the one byte string "0") can still be removed
        if (isset($table[$key])) {
            unset($table[$key]);
            return true;
        }
        return false;
    }
    /**
     * Return any records in $table associated with $key
     *
     * @param array $table array of key => records pairs where the
     *  key is a string of length given by this PackedTableTool's signature
     *  and the records are packed according this PackedTableTool's
     *  signature
     * @param string $key to return records for
     * @return mixed the value stored in $table for $key (records packed
     *  according to this PackedTableTool's signature), or null if $key is
     *  not present
     */
    public function find($table, $key)
    {
        return $table[$key] ?? null;
    }
    /**
     * Reads the table file at $table_path, as written by save() or by the
     * file based variants of add(), into an in-memory array.
     *
     * @param string $table_path path of the file to read
     * @param int $mode either self::REPLACE_MODE or self::APPEND_MODE. If
     *  the same key occurs more than once in the file, in the former case
     *  the last occurrence wins, in the latter the rows are merged using
     *  mergeRowValues()
     * @return mixed associative array of key => packed row pairs (empty
     *  if the file holds no records), or null if the file does not exist
     *  or could not be read
     */
    public function load($table_path, $mode = self::REPLACE_MODE)
    {
        if (!file_exists($table_path)) {
            return null;
        }
        $pre_table = file_get_contents($table_path);
        if ($pre_table === false) {
            return null;
        }
        $table = $this->compressor->uncompress($pre_table);
        $rows = [];
        if (!is_string($table) || $table === "") {
            return $rows;
        }
        $fixed_length_key = ($this->key_len > 0);
        $replace_mode = ($mode == self::REPLACE_MODE);
        $default_key_len = $this->key_len;
        // variable length keys are preceded by a one byte length
        $key_offset = $fixed_length_key ? 0 : 1;
        foreach (explode("\xFF", $table) as $pre_row) {
            if ($pre_row === "") {
                // an empty span between separators carries no record
                continue;
            }
            $entry = decode255($pre_row);
            if (!is_string($entry) || $entry === "") {
                continue;
            }
            $key_len = ($fixed_length_key) ? $default_key_len :
                ord($entry[0]);
            $key = substr($entry, $key_offset, $key_len);
            $new_values = substr($entry, $key_len + $key_offset);
            $rows[$key] = ($replace_mode || empty($rows[$key])) ?
                $new_values :
                $this->mergeRowValues($rows[$key], $new_values);
        }
        return $rows;
    }
    /**
     * Packs one item, or a list of items, into a single row string
     * according to this PackedTableTools signature.
     *
     * The row begins with the vByte-encoded number of items. For each item
     * it then holds the bool bits, the int width codes, the text lengths
     * and finally the column payloads in signature order. DOUBLE columns
     * use pack format "E" and REAL columns use pack format "G".
     *
     * @param array $items either a single item (associative array of
     *  column_name => value) or a 0-indexed array of such items
     * @return mixed the packed row as a string, or null if some item
     *  lacks (or has a null value for) a column of the signature, or a
     *  TEXT value is longer than 255 bytes
     */
    public function pack($items)
    {
        $format = $this->format;
        if (!isset($items[0])) {
            $items = [$items];
        }
        $packed_items = vByteEncode(count($items));
        foreach ($items as $item) {
            $bool_column_data = "";
            // -1 means no bits are pending for the current bool byte
            $cur_bool_int = -1;
            $cur_bool_shift = 7;
            $int_column_types = "";
            $text_column_lengths = "";
            $shift = 6;
            // -1 means no width codes are pending for the current byte
            $cur_int_char = 0;
            $int_occurred = false;
            $packed_data = "";
            foreach ($format as $field_name => $type) {
                if (!isset($item[$field_name])) {
                    return null;
                }
                $value = $item[$field_name];
                switch ($type) {
                    case "BOOL":
                        $cur_bool_int = max($cur_bool_int, 0);
                        $bool_value = ($value) ? 1 : 0;
                        $cur_bool_int += ($bool_value << $cur_bool_shift);
                        $cur_bool_shift--;
                        if ($cur_bool_shift < 0) {
                            $bool_column_data .= chr($cur_bool_int);
                            $cur_bool_shift = 7;
                            $cur_bool_int = -1;
                        }
                        break;
                    case "DOUBLE":
                        $packed_data .= pack("E", $value);
                        break;
                    case "INT":
                        $int_occurred = true;
                        $magnitude = abs($value);
                        // 128 flags a negative value; it is stored in the
                        // high bit of the first byte of the magnitude
                        $sign_bit = ($value == $magnitude) ? 0 : 128;
                        if ($magnitude < 128) {
                            // width code 0: a single byte
                            $packed_data .= chr($magnitude + $sign_bit);
                            $cur_int_char = max($cur_int_char, 0);
                        } else {
                            if ($magnitude < 32768) {
                                $packed_int = pack("n", $magnitude);
                                $cur_int_add = (1 << $shift);
                            } else if ($magnitude < 2147483647) {
                                $packed_int = pack("N", $magnitude);
                                $cur_int_add = (2 << $shift);
                            } else {
                                $packed_int = pack("J", $magnitude);
                                $cur_int_add = (3 << $shift);
                            }
                            if ($sign_bit) {
                                // unpack() reads the sign from this bit
                                $packed_int[0] = chr(
                                    ord($packed_int[0]) | $sign_bit);
                            }
                            $packed_data .= $packed_int;
                            $cur_int_char = max($cur_int_char, 0) +
                                $cur_int_add;
                        }
                        $shift -= 2;
                        if ($shift < 0) {
                            $int_column_types .= chr($cur_int_char);
                            $cur_int_char = -1;
                            $shift = 6;
                        }
                        break;
                    case "REAL":
                        $packed_data .= pack("G", $value);
                        break;
                    case "TEXT":
                        $len = strlen($value);
                        if ($len > 255) {
                            // the length must fit in a single byte
                            return null;
                        }
                        $packed_data .= $value;
                        $text_column_lengths .= chr($len);
                        break;
                }
            }
            // flush partially filled bool and int width code bytes
            if ($cur_bool_int != -1) {
                $bool_column_data .= chr($cur_bool_int);
            }
            if ($cur_int_char != -1 && $int_occurred) {
                $int_column_types .= chr($cur_int_char);
            }
            $packed_items .= $bool_column_data . $int_column_types .
                $text_column_lengths . $packed_data;
        }
        return $packed_items;
    }
    /**
     * Writes an in-memory table to a file in the format read by load(),
     * replacing any previous contents of the file.
     *
     * @param string $table_path path of the file to write
     * @param array $table associative array of key => packed row pairs
     * @return bool whether the file was written successfully (writing an
     *  empty table counts as success)
     */
    public function save($table_path, $table)
    {
        $fixed_length_key = ($this->key_len > 0);
        $records = [];
        foreach ($table as $key => $row) {
            // variable length keys are stored prefixed by a length byte
            $encode_key = ($fixed_length_key) ? $key :
                chr(strlen($key)) . $key;
            $records[] = encode255($encode_key . $row);
        }
        $out = $this->compressor->compress(implode("\xFF", $records));
        return (file_put_contents($table_path, $out) !== false);
    }
    /**
     * Given a table_row, which might represent several items grouped
     * because they share a key, returns the total number of items stored
     * in the row
     *
     * @param string $table_row sequence of items packed according to this
     *  PackedTableTools signature
     * @return int number of items stored in the table row
     */
    public function count($table_row)
    {
        $current_pos = 0;
        return vByteDecode($table_row, $current_pos);
    }
    /**
     * Given a table_row, which might represent several items grouped
     * because they share a key, unpacks and returns the $offset through
     * $offset + $limit numbered items
     *
     * @param string $table_row sequence of items packed according to this
     *  PackedTableTools signature
     * @param int $offset index of item to begin with
     * @param int $limit maximum number of items to return starting at
     *  offset; a value that is not positive means no limit
     * @return mixed array of unpacked items (each an associative array of
     *  column_name => value), an empty array if $offset is past the last
     *  item, or null if $table_row does not match the signature
     */
    public function unpack($table_row, $offset = 0, $limit = -1)
    {
        $num_int_columns = $this->num_int_columns;
        $num_text_columns = $this->num_text_columns;
        $num_bool_columns = $this->num_bool_columns;
        $bool_info_len = intval(ceil($num_bool_columns / 8));
        $int_info_len = intval(ceil($num_int_columns / 4));
        $header_len = $bool_info_len + $int_info_len + $num_text_columns;
        // pack() format to use for each non-zero int width code
        $unpack_code = [1 => "n", 2 => "N", 3 => "J"];
        $format = $this->format;
        $row_len = strlen($table_row);
        $items = [];
        $current_pos = 0;
        $num_items = vByteDecode($table_row, $current_pos);
        if ($offset >= $num_items) {
            return [];
        }
        $limit = ($limit <= 0) ? $num_items : $limit;
        $num_items = min($limit + $offset, $num_items);
        // items are variable length, so those before $offset still have
        // to be walked through to find where the requested ones start
        for ($i = 0; $i < $num_items; $i++) {
            if ($row_len - $current_pos < $header_len) {
                // row ends before this item's header is complete
                return null;
            }
            $bool_info = substr($table_row, $current_pos, $bool_info_len);
            $current_pos += $bool_info_len;
            $int_info = substr($table_row, $current_pos, $int_info_len);
            $current_pos += $int_info_len;
            $text_info = substr($table_row, $current_pos,
                $num_text_columns);
            $current_pos += $num_text_columns;
            $item = [];
            $current_bool_pos = 0;
            $cur_bool_shift = 7;
            $current_int_pos = 0;
            $current_text_pos = 0;
            $shift = 6;
            $bools_used = 0;
            $ints_used = 0;
            foreach ($format as $field_name => $type) {
                switch ($type) {
                    case "BOOL":
                        if ($bools_used >= $num_bool_columns) {
                            return null;
                        }
                        $bool_byte = ord($bool_info[$current_bool_pos]);
                        $item[$field_name] =
                            (($bool_byte >> $cur_bool_shift) & 1) > 0;
                        $bools_used++;
                        $cur_bool_shift--;
                        if ($cur_bool_shift < 0) {
                            $current_bool_pos++;
                            $cur_bool_shift = 7;
                        }
                        break;
                    case "DOUBLE":
                        $item[$field_name] = unpack("E",
                            substr($table_row, $current_pos, 8))[1];
                        $current_pos += 8;
                        break;
                    case "INT":
                        if ($ints_used >= $num_int_columns) {
                            return null;
                        }
                        $int_code = (ord($int_info[$current_int_pos]) >>
                            $shift) & 3;
                        if (!isset($table_row[$current_pos])) {
                            return null;
                        }
                        $first_char = ord($table_row[$current_pos]);
                        // width codes 0..3 give lengths 1, 2, 4, 8
                        $len = 1 << $int_code;
                        if ($int_code == 0) {
                            $value = $first_char;
                            if ($value > 127) {
                                $value = - ($value & 127);
                            }
                            $item[$field_name] = $value;
                        } else {
                            $sign = ($first_char > 127) ? -1 : 1;
                            if ($sign < 0) {
                                // clear the sign bit before decoding the
                                // magnitude; $table_row is a local copy
                                $table_row[$current_pos] =
                                    chr($first_char - 128);
                            }
                            $pre_int = substr($table_row, $current_pos,
                                $len);
                            $item[$field_name] = $sign * (unpack(
                                $unpack_code[$int_code], $pre_int)[1]);
                        }
                        $current_pos += $len;
                        $ints_used++;
                        $shift -= 2;
                        if ($shift < 0) {
                            $current_int_pos++;
                            $shift = 6;
                        }
                        break;
                    case "REAL":
                        $item[$field_name] = unpack("G",
                            substr($table_row, $current_pos, 4))[1];
                        $current_pos += 4;
                        break;
                    case "TEXT":
                        if ($current_text_pos >= $num_text_columns) {
                            return null;
                        }
                        $text_len = ord($text_info[$current_text_pos]);
                        $item[$field_name] = substr($table_row,
                            $current_pos, $text_len);
                        $current_pos += $text_len;
                        $current_text_pos++;
                        break;
                }
            }
            if ($i >= $offset) {
                $items[] = $item;
            }
        }
        return $items;
    }
}