fixes word docs len calc, a=chris

Chris Pollett [2011-08-01 05:Aug:st]
fixes word docs len calc, a=chris
Filename
bin/fetcher.php
bin/queue_server.php
lib/index_archive_bundle.php
lib/index_dictionary.php
lib/index_shard.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 1342aea94..86b377874 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -39,7 +39,7 @@ define("BASE_DIR", substr(
     dirname(realpath($_SERVER['PHP_SELF'])), 0,
     -strlen("/bin")));

-ini_set("memory_limit","850M"); //so have enough memory to crawl big pages
+ini_set("memory_limit","600M"); //so have enough memory to crawl big pages

 /** Load in global configuration settings */
 require_once BASE_DIR.'/configs/config.php';
diff --git a/bin/queue_server.php b/bin/queue_server.php
index b6a8e75d6..ce782d0cd 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -38,7 +38,7 @@ define("BASE_DIR", substr(
     dirname(realpath($_SERVER['PHP_SELF'])), 0,
     -strlen("/bin")));

-ini_set("memory_limit","1200M"); //so have enough memory to crawl big pages
+ini_set("memory_limit","1000M"); //so have enough memory to crawl big pages

 /** Load in global configuration settings */
 require_once BASE_DIR.'/configs/config.php';
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index a3327fcea..fb14f7932 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -266,7 +266,7 @@ class IndexArchiveBundle implements CrawlConstants
            in case merge tiers after adding to dictionary
         */
         $this->current_shard = new IndexShard(
-            $current_index_shard_file, $this->generation_info['ACTIVE'],
+            $current_index_shard_file, $this->generation_info['ACTIVE'],
                 $this->num_docs_per_generation, true);
         $this->dictionary->addShardDictionary($this->current_shard);
     }
diff --git a/lib/index_dictionary.php b/lib/index_dictionary.php
index 5af0ebd71..836fa16b0 100644
--- a/lib/index_dictionary.php
+++ b/lib/index_dictionary.php
@@ -164,7 +164,7 @@ class IndexDictionary implements CrawlConstants
      * @param object $index_shard the shard to add the word to the dictionary
      *      with
      */
-    function addShardDictionary(&$index_shard)
+    function addShardDictionary($index_shard)
     {
         $out_slot = "A";
         if(file_exists($this->dir_name."/0/0A.dic")) {
@@ -193,11 +193,11 @@ class IndexDictionary implements CrawlConstants
                         $first_offset_flag = false;
                     }
                     $offset -= $first_offset;
-                    $out = pack("N", $offset).pack("N", $count);
+                    $out = pack("N", $offset) . pack("N", $count);
                     $last_set = $j;
                     $last_out = $prefix_info;
                     charCopy($out, $prefix_string,
-                        (($i << 8) + $j)*self::PREFIX_ITEM_SIZE,
+                        (($i << 8) + $j) * self::PREFIX_ITEM_SIZE,
                         self::PREFIX_ITEM_SIZE);
                 }
             }
@@ -210,7 +210,7 @@ class IndexDictionary implements CrawlConstants
             if($last_set >= 0) {
                 list($offset, $count) = $last_out;
                 $next_offset = $base_offset + $offset +
-                    $count*IndexShard::WORD_ITEM_LEN;
+                    $count * IndexShard::WORD_ITEM_LEN;
                 fwrite($fh, $index_shard->getShardSubstring($last_offset,
                     $next_offset - $last_offset));
             }
@@ -595,7 +595,10 @@ class IndexDictionary implements CrawlConstants
      }

     /**
-     *
+     * Looks up the shard information (which is actually embedded in
+     * the dictionary) for an info:url query
+     * @param string $hash_info_url hash of info:url meta word
+     * @return array summary (to the extent stored in a shard) data for this url
      */
     function getInfoItem($hash_info_url)
     {
diff --git a/lib/index_shard.php b/lib/index_shard.php
index 2d0170235..bf6cf0c04 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -535,7 +535,7 @@ class IndexShard extends PersistentStructure implements

         $num_docs_so_far = 0;
         $results = array();
-        $end = min($this->word_docs_len, $last_offset);
+        $end = min($this->file_len - $this->docids_len, $last_offset);
         $num_docs_or_links =
             self::numDocsOrLinks($start_offset, $last_offset);

@@ -1504,9 +1504,7 @@ class IndexShard extends PersistentStructure implements
     function getDocInfoSubstring($offset, $len)
     {
         if($this->read_only_from_disk) {
-            $base_offset = self::HEADER_LENGTH +
-                $this->prefixes_len + $this->words_len + $this->word_docs_len;
-
+            $base_offset = $this->file_len - $this->docids_len;
             return $this->getShardSubstring($base_offset + $offset, $len);
         }
         return substr($this->doc_infos, $offset, $len);
ViewGit