Gets the recipe plugin working again, a=chris

Chris Pollett [2013-03-06 02h]
Gets the recipe plugin working again, a=chris
Filename
bin/fetcher.php
bin/queue_server.php
lib/index_bundle_iterators/word_iterator.php
lib/indexing_plugins/recipe_plugin.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 85f258359..85b0dead5 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -515,7 +515,7 @@ class Fetcher implements CrawlConstants
                 }
             } else if ($this->crawl_time > 0) { /* case(3) */
                 // Either a web crawl or a recrawl of a previous web crawl.
-                crawlLog("MAIN LOOP CASE 3 -- WEB SCHEDULER");
+                crawlLog("MAIN LOOP CASE 3 -- WEB/RE-CRAWL SCHEDULER");
                 $info = $this->checkScheduler();

                 if($info === false) {
@@ -707,6 +707,7 @@ class Fetcher implements CrawlConstants
                 crawlLog("  participate in a web archive recrawl!!");
                 return $pages;
             } else {
+                crawlLog("Initializing Web Archive Bundle Iterator.");
                 $this->archive_iterator =
                     new WebArchiveBundleIterator($prefix, $this->crawl_index,
                         $this->crawl_time);
@@ -924,7 +925,7 @@ class Fetcher implements CrawlConstants
         $to_crawl_count = count($this->to_crawl);
         $to_crawl_again_count = count($this->to_crawl_again);
         if($this->recrawl_check_scheduler) {
-            crawlLog("Arc Crawl checking scheduler??");
+            crawlLog("Archive Crawl checking ... Recrawl.");
         }
         if((count($this->to_crawl) > 0 || count($this->to_crawl_again) > 0) &&
            (!$this->recrawl_check_scheduler)) {
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 249e5b418..0daaa2300 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -1500,7 +1500,7 @@ class QueueServer implements CrawlConstants, Join
                 " time: ".(changeInMicrotime($start_time)));
             $start_time = microtime();

-            $this->index_archive->addIndexData($index_shard, $this);
+            $this->index_archive->addIndexData($index_shard);
             $this->index_dirty = true;
         }
         crawlLog("D (add index shard) memory usage".memory_get_usage().
diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php
index bd5ce6bbb..e8a6290a7 100644
--- a/lib/index_bundle_iterators/word_iterator.php
+++ b/lib/index_bundle_iterators/word_iterator.php
@@ -163,13 +163,11 @@ class WordIterator extends IndexBundleIterator
             $word_key = base64_decode($hash);

         }
-
         if($filter != NULL) {
             $this->filter = & $filter;
         } else {
             $this->filter = NULL;
         }
-
         $this->word_key = $word_key;

         $this->feed_shard_name = WORK_DIRECTORY."/feeds/index";
diff --git a/lib/indexing_plugins/recipe_plugin.php b/lib/indexing_plugins/recipe_plugin.php
index b7c354434..db8ecd0a4 100644
--- a/lib/indexing_plugins/recipe_plugin.php
+++ b/lib/indexing_plugins/recipe_plugin.php
@@ -116,7 +116,8 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants

         $xpath = new DOMXPath($dom);
         $recipes_per_page = $xpath->evaluate(
- /*allr, f.com, brec, fnet*/ "/html//ul[@class = 'ingredient-wrap'] |
+            /*allr, f.com, brec, fnet*/
+            "/html//ul[@class = 'ingredient-wrap'] |
             /html//*[@class = 'pod ingredients'] |
             /html//*[@id='recipe_title'] |
             /html//div[@class = 'rcp-head clrfix']|
@@ -126,14 +127,16 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants
         if(is_object($recipes_per_page) && $recipes_per_page->length != 0) {
             $recipes_count = $recipes_per_page->length;
             $titles = $xpath->evaluate(
- /* allr, f.com, brec, fnet   */ "/html//*[@id = 'itemTitle']|
+               /* allr, f.com, brec, fnet   */
+               "/html//*[@id = 'itemTitle']|
                /html//h1[@class = 'fn'] |
                /html//*[@id='recipe_title'] |
                /html//div[@class ='rcp-head clrfix']/h1 |
                /html//h1[@class = 'fn recipeDetailHeading']");
             for($i=0; $i < $recipes_count; $i++) {
                 $ingredients = $xpath->evaluate(
- /*allr*, fcomm, brec, fnet*/    "/html//ul[@class = 'ingredient-wrap']/li |
+                    /*allr*, fcomm, brec, fnet*/
+                    "/html//ul[@class = 'ingredient-wrap']/li |
                     /html//li[@class = 'ingredient']|
                     /html//*[@class = 'ingredients']/*|
                     /html//*[@itemprop='ingredients']
@@ -185,7 +188,7 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants
         $more_docs = true;
         $raw_recipes = array();
         $limit = 0;
-        $num = 10;
+        $num = 100;
         while($more_docs) {
             $results = @$search_controller->queryRequest($query,
                 $num, $limit, 1, $index_name);
@@ -327,7 +330,7 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants
             $clusters = kruskalClustering($weights,
                 $count, $distinct_ingredients);
             $index_shard = new IndexShard("cluster_shard");
-            $word_counts = array();
+            $word_lists = array();
             $recipe_sites = array();

             foreach($clusters as $cluster) {
@@ -336,7 +339,6 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants
                     $meta_ids = array();
                     $summary = array();
                     $recipe = $cluster[$i];
-                    $doc_key = $doc_keys[$recipe];
                     $summary[self::URL] =
                         $recipes_summary[$recipe][self::URL];
                     $summary[self::TITLE] =
@@ -349,26 +351,42 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants
                         $recipes_summary[$recipe][self::ENCODING];
                     $summary[self::HASH] =
                         $recipes_summary[$recipe][self::HASH];
+                    $doc_keys[$recipe] =
+                        crawlHash($summary[self::URL], true);
+                    $hash_rhost =  "r". substr(crawlHash( // r is for recipe
+                        UrlParser::getHost($summary[self::URL])."/",true), 1);
+                    $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
                     $summary[self::TYPE] =
                         $recipes_summary[$recipe][self::TYPE];
                     $summary[self::HTTP_CODE] =
                         $recipes_summary[$recipe][self::HTTP_CODE];
                     $recipe_sites[] = $summary;
-                    $meta_ids[] = "ingredient:".$cluster["ingredient"];
-                    $index_shard->addDocumentWords($doc_key,
+                    $meta_ids[] = "ingredient:".trim($cluster["ingredient"]);
+                    crawlLog("ingredient:".$cluster["ingredient"]);
+                    if(!$index_shard->addDocumentWords($doc_keys[$recipe],
                         self::NEEDS_OFFSET_FLAG,
-                        $word_counts, $meta_ids, true, false);
-                    $index_shard->save(true);
+                        $word_lists, $meta_ids, true, false)) {
+                        crawlLog("Problem inserting recipe: ".
+                            $summary[self::TITLE]);
+                    }
                 }

             }
+            $shard_string = $index_shard->save(true);
+            $index_shard = IndexShard::load("cluster_shard",
+                $shard_string);
+            unset($shard_string);

             crawlLog("...Adding recipe shard to index archive bundle");

             $dir = CRAWL_DIR."/cache/".self::index_data_base_name.$index_name;
             $index_archive = new IndexArchiveBundle($dir, false);
+            if($index_shard->word_docs_packed) {
+                $index_shard->unpackWordDocs();
+            }
             $generation = $index_archive->initGenerationToAdd($index_shard);
             if(isset($recipe_sites)) {
+                crawlLog("... Adding ".count($recipe_sites)." recipe docs.");
                 $index_archive->addPages($generation,
                     self::SUMMARY_OFFSET, $recipe_sites, 0);
             }
@@ -379,8 +397,7 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants
                     $site[self::HASH] .
                     "r". substr(crawlHash( // r is for recipe
                     UrlParser::getHost($site[self::URL])."/",true), 1);
-                $summary_offsets[$hash] =
-                    array($site[self::SUMMARY_OFFSET], null);
+                $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
             }
             $index_shard->changeDocumentOffsets($summary_offsets);
             $index_archive->addIndexData($index_shard);
ViewGit