Fixes an issue with unset summary offsets in shard after deduplication on queue_server, a=chris

Chris Pollett [2010-10-17 20:Oct:th]
Fixes an issue with unset summary offsets in shard after deduplication on queue_server, a=chris
Filename
bin/fetcher.php
lib/crawl_constants.php
lib/index_archive_bundle.php
lib/index_bundle_iterators/group_iterator.php
lib/index_bundle_iterators/index_bundle_iterator.php
lib/index_bundle_iterators/intersect_iterator.php
lib/index_bundle_iterators/phrase_filter_iterator.php
lib/index_bundle_iterators/union_iterator.php
lib/index_bundle_iterators/word_iterator.php
lib/index_shard.php
lib/indexing_constants.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 14be82124..0051b5caf 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -1157,16 +1157,17 @@ class Fetcher implements CrawlConstants
                         mb_ereg_replace("[[:punct:]]", " ", $link_text);
                     $link_word_counts =
                         PhraseParser::extractPhrasesAndCount($link_text);
-                    $link_shard->addDocumentWords($link_key, 0,
+                    $link_shard->addDocumentWords($link_key,
+                        self::NEEDS_OFFSET_FLAG,
                         $link_word_counts, array());

                     $meta_ids[] = 'link:'.$url;
                 }

             }
-            $index_shard->addDocumentWords($doc_key, 0, $word_counts,
-                $meta_ids);
-
+            $index_shard->addDocumentWords($doc_key, self::NEEDS_OFFSET_FLAG,
+                $word_counts, $meta_ids);
+
             $index_shard->appendIndexShard($link_shard);

         }
diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 7f69091dd..87ff70ccc 100644
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -135,5 +135,8 @@ interface CrawlConstants
     const RELEVANCE ='an';
     const DUPLICATE ='ao';
     const META_WORDS ='ap';
+
+    const NEEDS_OFFSET_FLAG = 0x7FFFFFFE;
+    const DUPLICATE_FLAG = 0x7FFFFFFF;
 }
 ?>
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index 5c55f159e..2835f5e95 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -54,11 +54,6 @@ require_once 'utility.php';
  */
 require_once 'crawl_constants.php';

-/**
- *Loads common constants for word indexing
- */
-require_once 'indexing_constants.php';
-

 /**
  * Encapsulates a set of web page summaries and an inverted word-index of terms
@@ -81,7 +76,7 @@ require_once 'indexing_constants.php';
  * @package seek_quarry
  * @subpackage library
  */
-class IndexArchiveBundle implements IndexingConstants, CrawlConstants
+class IndexArchiveBundle implements CrawlConstants
 {

     /**
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index d5ace8fa8..122e5d016 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -33,11 +33,6 @@

 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

-/**
- *Loads common constants for word indexing
- */
-require_once BASE_DIR.'/lib/indexing_constants.php';
-
 /**
  *Loads base class for iterating
  */
diff --git a/lib/index_bundle_iterators/index_bundle_iterator.php b/lib/index_bundle_iterators/index_bundle_iterator.php
index 9bb7fec8c..dbef9bc3b 100644
--- a/lib/index_bundle_iterators/index_bundle_iterator.php
+++ b/lib/index_bundle_iterators/index_bundle_iterator.php
@@ -33,11 +33,6 @@

 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

-/**
- *Loads common constants for word indexing
- */
-require_once BASE_DIR.'/lib/indexing_constants.php';
-
 /**
  * Abstract classed used to model iterating documents indexed in
  * an IndexArchiveBundle or set of such bundles.
@@ -48,7 +43,7 @@ require_once BASE_DIR.'/lib/indexing_constants.php';
  * @subpackage iterator
  * @see IndexArchiveBundle
  */
-abstract class IndexBundleIterator implements IndexingConstants, CrawlConstants
+abstract class IndexBundleIterator implements CrawlConstants
 {

     /**
diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php
index 6bfa74121..d498461dd 100644
--- a/lib/index_bundle_iterators/intersect_iterator.php
+++ b/lib/index_bundle_iterators/intersect_iterator.php
@@ -33,11 +33,6 @@

 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

-/**
- *Loads common constants for word indexing
- */
-require_once BASE_DIR.'/lib/indexing_constants.php';
-
 /**
  *Loads BloomFilterFile to remember things we've already grouped
  */
diff --git a/lib/index_bundle_iterators/phrase_filter_iterator.php b/lib/index_bundle_iterators/phrase_filter_iterator.php
index a844c3e9d..fc7fa5fc6 100644
--- a/lib/index_bundle_iterators/phrase_filter_iterator.php
+++ b/lib/index_bundle_iterators/phrase_filter_iterator.php
@@ -33,11 +33,6 @@

 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

-/**
- *Loads common constants for word indexing
- */
-require_once BASE_DIR.'/lib/indexing_constants.php';
-
 /**
  *Loads base class for iterating
  */
diff --git a/lib/index_bundle_iterators/union_iterator.php b/lib/index_bundle_iterators/union_iterator.php
index 4c79ec0a0..9f3df46b7 100644
--- a/lib/index_bundle_iterators/union_iterator.php
+++ b/lib/index_bundle_iterators/union_iterator.php
@@ -33,11 +33,6 @@

 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

-/**
- *Loads common constants for word indexing
- */
-require_once BASE_DIR.'/lib/indexing_constants.php';
-
 /**
  *Loads BloomFilterFile to remember things we've already grouped
  */
diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php
index a1e31358d..93a5fa321 100644
--- a/lib/index_bundle_iterators/word_iterator.php
+++ b/lib/index_bundle_iterators/word_iterator.php
@@ -33,11 +33,6 @@

 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

-/**
- *Loads common constants for word indexing
- */
-require_once BASE_DIR.'/lib/indexing_constants.php';
-
 /**
  *Loads base class for iterating
  */
diff --git a/lib/index_shard.php b/lib/index_shard.php
index 019c18e21..c281cc0ff 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -215,7 +215,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants
                 $previous_string = substr($value, 4, 4);
                 $count_array = unpack("N", substr($value, 8, 4));
                 $count =  $count_array[1];
-                if($count == 0x7FFFFFFF) { continue; }
+                if($count == self::DUPLICATE_FLAG) { continue; }
                 $count++;
                 $value = $first_string . pack("N", $this->word_docs_len) .
                     pack("N", $count);
@@ -332,7 +332,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             $is_doc = false;
             $skip_stats = false;

-            if($item[self::SUMMARY_OFFSET] == 0x7FFFFFFF) {
+            if($item[self::SUMMARY_OFFSET] == self::DUPLICATE_FLAG ||
+                $item[self::SUMMARY_OFFSET] == self::NEEDS_OFFSET_FLAG) {
                 $skip_stats = true;
                 $item[self::DUPLICATE] = true;
             } else if(($tmp[1] & self::COMPOSITE_ID_FLAG) !== 0) {
@@ -444,8 +445,11 @@ class IndexShard extends PersistentStructure implements CrawlConstants
                 $count_string = substr($value, 8, 4);
                 $tmp = unpack("N", $count_string);
                 $count = $tmp[1];
-                if($count == 0x7FFFFFFF || $add_count == 0x7FFFFFFF) {
-                    $new_count = 0x7FFFFFFF;
+                if($count == self::DUPLICATE_FLAG ||
+                    $count == self::NEEDS_OFFSET_FLAG ||
+                    $add_count == self::DUPLICATE_FLAG ||
+                    $add_count == self::NEEDS_OFFSET_FLAG) {
+                    $new_count =self::DUPLICATE_FLAG;
                 } else {
                     $new_count = $count + $add_count;
                 }
@@ -487,7 +491,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             $id = substr($this->doc_infos, $i, 8);
             $tmp = unpack("N", substr($this->doc_infos, $i + 8, 4));
             $offset = $tmp[1];
-            if($offset == 0x7FFFFFFF) {continue; }//ignore duplicates
+            if($offset == self::DUPLICATE_FLAG) {continue; }//ignore duplicates
+                //note: NEEDS_OFFSET_FLAG entries are deliberately not skipped
             $comp_flag = 0;
             if(($offset & self::COMPOSITE_ID_FLAG) !== 0) {
                 //handle link item case
@@ -507,8 +512,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants
     /**
      * Marks a set of urls as duplicates of urls previously seen
      * To do this the url's doc_id has associated with a summary
-     * offset of value 0x7FFFFFFF, and its length is set to
-     * 0XFFFFFFFF
+     * offset of value 0x7FFFFFFF (CrawlConstants::DUPLICATE_FLAG), and its
+     * length is set to 0xFFFFFFFF
      *
      * @param array $doc_urls urls to mark as duplicates.
      */
@@ -516,12 +521,12 @@ class IndexShard extends PersistentStructure implements CrawlConstants
     {
         foreach($doc_urls as $duplicate) {
             $doc_key = crawlHash($duplicate, true);
-            $this->doc_infos .= $doc_key . pack("N", 0x7FFFFFFF).
+            $this->doc_infos .= $doc_key . pack("N", self::DUPLICATE_FLAG).
                 pack("N", 0xFFFFFFFF);
             $word_key = crawlHash("info:".$duplicate, true);
             $this->word_docs .= pack("N", ($this->docids_len<< 4)).pack("N",0);
             $tmp = pack("N", $this->word_docs_len);
-            $this->words[$word_key] = $tmp.$tmp.pack("N", 0x7FFFFFFF);
+            $this->words[$word_key] = $tmp.$tmp.pack("N", self::DUPLICATE_FLAG);
             $this->word_docs_len += 8;
             $this->docids_len += 16;
         }
diff --git a/lib/indexing_constants.php b/lib/indexing_constants.php
deleted file mode 100644
index ba5348276..000000000
--- a/lib/indexing_constants.php
+++ /dev/null
@@ -1,58 +0,0 @@
-<?php
-/**
- *  SeekQuarry/Yioop --
- *  Open Source Pure PHP Search Engine, Crawler, and Indexer
- *
- *  Copyright (C) 2009, 2010  Chris Pollett chris@pollett.org
- *
- *  LICENSE:
- *
- *  This program is free software: you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation, either version 3 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- *  END LICENSE
- *
- * @author Chris Pollett chris@pollett.org
- * @package seek_quarry
- * @subpackage library
- * @license http://www.gnu.org/licenses/ GPL3
- * @link http://www.seekquarry.com/
- * @copyright 2009, 2010
- * @filesource
- */
-
-if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
-
-/**
- * Enumerative interface for common constants between WordIterator and
- * IndexArchiveBundle
- *
- * These constants are used as fields in arrays. They are negative to
- * distinguish them from normal array elements 0, 1, 2... However, this
- * means you need to be slightly careful if you try to sort the array
- * as this might screw things up
- *
- * @author Chris Pollett
- * @package seek_quarry
- * @subpackage library
- */
-interface IndexingConstants
-{
-    const COUNT = -1;
-    const END_BLOCK = -2;
-    const LIST_OFFSET = -3;
-    const POINT_BLOCK = -4;
-    const PARTIAL_COUNT = -5;
-    const NAME = -6;
-}
-?>
ViewGit