Fixes an offset bug in adding words to the index caused by feeding in position lists with 0 in them, which won't work with modified9, a=chris

Chris Pollett [2019-07-11 16:Jul:th]

Fixes an offset bug in adding words to the index caused by feeding in position lists with 0 in them, which won't work with modified9, a=chris

Filename
src/library/PhraseParser.php
src/library/Utility.php
src/library/WebArchive.php
src/library/WebArchiveBundle.php
src/library/summarizers/Summarizer.php
tests/UtilityTest.php

diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 082ab2ee5..f5a264a84 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -439,7 +439,7 @@ class PhraseParser
     /**
      * Splits string according to punctuation and white space then
      * extracts (stems/char grams) of terms and n word grams from the string
-     * Uses a notiona of maximal n word gram to dot eh extraction
+     * Uses a notion of a maximal n word gram to do the extraction
      *
      * @param string $string to extract terms from
      * @param string $lang IANA tag to look up stemmer under
@@ -452,34 +452,33 @@ class PhraseParser
         $lang = null, $extract_sentences = false)
     {
         $pos_lists = [];
-        $maximal_phrases = [];
         $terms = self::stemCharGramSegment($string, $lang);
-        if ($terms == []) {
+        if (empty($terms)) {
             return [];
         }
         if (C\SUFFIX_PHRASES == 'true') {
             $suffix_tree = new SuffixTree($terms);
-            $suffix_tree->outputMaximal(1, "", 0, $maximal_phrases);
+            $suffix_tree->outputMaximal(1, "", 0, $pos_lists);
         }
-        $t = 0;
-        $seen = [];
+        $t = 1; /*first position in doc is 1 as will encode with modified9
+             which requires positive numbers
+        */
         // add all single terms
         foreach ($terms as $term) {
-            if (!isset($seen[$term])) {
-                $seen[$term] = [];
-                $maximal_phrases[$term] = [];
+            if (!isset($pos_lists[$term])) {
+                $pos_lists[$term] = [];
             }
-            $maximal_phrases[$term][] = $t;
+            $pos_lists[$term][] = $t;
             $t++;
         }
-        $out["TERMS_AND_PHRASES"] = $maximal_phrases;
+        $out["TERMS_AND_PHRASES"] = $pos_lists;
         $tokenizer = self::getTokenizer($lang);
         if ($extract_sentences && method_exists($tokenizer,
             "tagTokenizePartOfSpeech") &&
             !isset(self::$programming_language_map[$lang])) {
             $string = mb_strtolower($string);
             $pre_sentences = preg_split("/(\n\n+)|\.|\!|\?|。/u", $string);
-            $pos = 0;
+            $pos = 1;
             $sentences_pos = [];
             $sentences = [];
             foreach ($pre_sentences as $pre_sentence) {
diff --git a/src/library/Utility.php b/src/library/Utility.php
index 4eb6a62cd..16a3e6226 100755
--- a/src/library/Utility.php
+++ b/src/library/Utility.php
@@ -388,7 +388,10 @@ function deDeltaList(&$delta_list)

 /**
  * Mini-class (so not own file) used to hold encode decode info related to
- * Mod9 encoding (as variant of Simplified-9 specify to Yioop)
+ * Mod9 encoding (a variant of Simplified-9 specific to Yioop).
+ * Mod9 is used to encode a sequence of positive (greater than 0) integers
+ * as a string. WARNING: do not expect it to work/decode correctly if the
+ * sequence has a 0 as the decoding process assumes 0 indicates end of sequence.
  * @see encodeModified9 for a complete description
  */
 class Mod9Constants
@@ -433,7 +436,7 @@ class Mod9Constants
 }
 /**
  * Encodes a sequence of integers x, such that 1 <= x <= 2<<28-1
- * as a string.
+ * as a string. NOTICE x>=1.
  *
  * The encoded string is a sequence of 4 byte words (packed int's).
  * The high order 2 bits of a given word indicate whether or not
@@ -545,8 +548,7 @@ function nextPostString(&$input_string, &$offset)
         return "";
     }
     $end += 4;
-    while($end < $len &&
-            $flag_bits >= $continue_threshold) {
+    while($end < $len && $flag_bits >= $continue_threshold) {
         $flag_bits = (ord($input_string[$end]) & $flag_mask);
         $end += 4;
     }
@@ -574,7 +576,7 @@ function decodeModified9($input_string, &$offset)

 if (!extension_loaded("yioop") ) {
 /**
- * Decoded a single word with high two bits off according to modified 9
+ * Decode a single word with high two bits off according to modified 9
  *
  * @param string $encoded_list four byte string to decode
  * @return array sequence of integers that results from the decoding.
@@ -605,17 +607,20 @@ function unpackListModified9($encoded_list)
             $int_string = packInt($encoded_list);
             $first_char = ord($int_string[0]);
             foreach ($MOD9_NUM_BITS_CODES as $code => $num_bits) {
-                if (($first_char & $code) == $code) break;
+                if (($first_char & $code) == $code) {
+                    break;
+                }
             }
             $num_elts = $MOD9_NUM_ELTS_DECODES[$code];
             $mask = (1 << $num_bits) - 1;
             $int_string[0] = chr($first_char - $code);
             $encoded_list = unpackInt($int_string);
     }
-
     $decoded_list = [];
     for ($i = 0; $i < $num_elts; $i++) {
-        if (($pre_elt = $encoded_list & $mask) == 0) break;
+        if (($pre_elt = $encoded_list & $mask) == 0) {
+            break;
+        }
         array_unshift($decoded_list, $pre_elt);
         $encoded_list >>= $num_bits;
     }
diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php
index 4fecdded2..a7222f0c6 100755
--- a/src/library/WebArchive.php
+++ b/src/library/WebArchive.php
@@ -329,8 +329,12 @@ class WebArchive
         if ((!$is_string && fseek($fh, $offset) == 0 ) || ($is_string
             && $offset < $storage_len)) {
             for ($i = 0; $i < $num; $i++) {
-                if (!$is_string && feof($fh)) {break; }
-                if ($is_string && $offset >= $storage_len) {break; }
+                if (!$is_string && feof($fh)) {
+                    break;
+                }
+                if ($is_string && $offset >= $storage_len) {
+                    break;
+                }
                 $object = null;
                 $compressed_len = ($is_string)
                     ? substr($this->storage, $offset, $compressed_int_len)
diff --git a/src/library/WebArchiveBundle.php b/src/library/WebArchiveBundle.php
index 6f0ef6960..f23b4cb1e 100755
--- a/src/library/WebArchiveBundle.php
+++ b/src/library/WebArchiveBundle.php
@@ -262,7 +262,7 @@ class WebArchiveBundle
             if (!$archive_name_exists) {
                 /* always add a dummy record so an offset 0 of a real record
                    can never be legit. This is just to be on the safe side
-                   if a changeDocumentOffsets in IndexShard happens not to work
+                   if a changeDocumentOffsets in IndexShard happens not to work.
                  */
                 $dummy_pages = [["DUMMY"]];
                 $this->partition[$index]->addObjects("DUMMY_OFFSET",
@@ -334,7 +334,7 @@ class WebArchiveBundle
             $info['NUM_DOCS_PER_PARTITION'] = -1;
             return $info;
         }
-        $info = unserialize(file_get_contents($dir_name."/description.txt"));
+        $info = unserialize(file_get_contents($dir_name . "/description.txt"));
         return $info;
     }
     /**
diff --git a/src/library/summarizers/Summarizer.php b/src/library/summarizers/Summarizer.php
index 65020eca5..55fa424a6 100644
--- a/src/library/summarizers/Summarizer.php
+++ b/src/library/summarizers/Summarizer.php
@@ -363,7 +363,9 @@ class Summarizer
         sort($summary_indices);
         $eos = ($lang == 'hi') ? "।" : "."; //default end of sentence symbol
         $summary_scores = [];
-        $score_pos = 0;
+        $score_pos = 1; /* Starting offset in docs always 1 not 0 so works with
+            modified9 encoding/decoding
+         */
         foreach ($summary_indices as $index) {
             $sentence = PhraseParser::compressSentence($sentences[$index],
                 $lang);
diff --git a/tests/UtilityTest.php b/tests/UtilityTest.php
index 9d47d2a4f..226c500dc 100644
--- a/tests/UtilityTest.php
+++ b/tests/UtilityTest.php
@@ -97,17 +97,17 @@ class UtilityTest extends UnitTest
         $packed = L\packPosting(33689, $posting_list);
         $out_doc_list = L\unpackPosting($packed, $offset, true);
         $this->assertEqual($out_doc_list[0], 33689,
-            "Doc index from unpack of first word has delta 0 case");
+            "Doc index from unpack of first word has delta[0] case");
         $this->assertEqual($out_doc_list[1], $posting_list,
-            "Unpack of delta 0 case");
+            "Unpack of delta[0] case");
         $offset = 0;
         $posting_list = [511, 12000, 24000];
         $packed = L\packPosting(33689, $posting_list);
         $out_doc_list = L\unpackPosting($packed, $offset, true);
         $this->assertEqual($out_doc_list[0], 33689,
-            "Doc index from unpack of first word has delta 0 case");
+            "Doc index from unpack of first word has delta[0] case 2");
         $this->assertEqual($out_doc_list[1], $posting_list,
-            "Unpack of delta 0 case");
+            "Unpack of delta[0] case 2");
         $posting_list = [6000, 12000, 24000];
         $packed = L\packPosting(100000, $posting_list);
         $offset = 0;
@@ -116,6 +116,15 @@ class UtilityTest extends UnitTest
             "Bigger Doc index from unpack of long packed posting equal");
         $this->assertEqual($out_doc_list[1], $posting_list,
             "Bigger Delta unpack of posting equal");
+        $posting_list = [1, 4, 7, 174];
+        $packed = L\packPosting(0, $posting_list);
+        $unpack_int = unpack("N*", $packed);
+        $offset = 0;
+        $out_doc_list = L\unpackPosting($packed, $offset, true);
+        $this->assertEqual($out_doc_list[0], 0,
+            "Doc index from unpack of doc index 0 case");
+        $this->assertEqual($out_doc_list[1], $posting_list,
+            "Unpack of doc index 0 case");
     }
     /**
      * Used to check if the functions to encode decode queue weight are

ViewGit