diff --git a/bin/fetcher.php b/bin/fetcher.php
index 85f258359..85b0dead5 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -515,7 +515,7 @@ class Fetcher implements CrawlConstants
}
} else if ($this->crawl_time > 0) { /* case(3) */
// Either a web crawl or a recrawl of a previous web crawl.
- crawlLog("MAIN LOOP CASE 3 -- WEB SCHEDULER");
+ crawlLog("MAIN LOOP CASE 3 -- WEB/RE-CRAWL SCHEDULER");
$info = $this->checkScheduler();
if($info === false) {
@@ -707,6 +707,7 @@ class Fetcher implements CrawlConstants
crawlLog(" participate in a web archive recrawl!!");
return $pages;
} else {
+ crawlLog("Initializing Web Archive Bundle Iterator.");
$this->archive_iterator =
new WebArchiveBundleIterator($prefix, $this->crawl_index,
$this->crawl_time);
@@ -924,7 +925,7 @@ class Fetcher implements CrawlConstants
$to_crawl_count = count($this->to_crawl);
$to_crawl_again_count = count($this->to_crawl_again);
if($this->recrawl_check_scheduler) {
- crawlLog("Arc Crawl checking scheduler??");
+ crawlLog("Archive Crawl checking ... Recrawl.");
}
if((count($this->to_crawl) > 0 || count($this->to_crawl_again) > 0) &&
(!$this->recrawl_check_scheduler)) {
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 249e5b418..0daaa2300 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -1500,7 +1500,7 @@ class QueueServer implements CrawlConstants, Join
" time: ".(changeInMicrotime($start_time)));
$start_time = microtime();
- $this->index_archive->addIndexData($index_shard, $this);
+ $this->index_archive->addIndexData($index_shard);
$this->index_dirty = true;
}
crawlLog("D (add index shard) memory usage".memory_get_usage().
diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php
index bd5ce6bbb..e8a6290a7 100644
--- a/lib/index_bundle_iterators/word_iterator.php
+++ b/lib/index_bundle_iterators/word_iterator.php
@@ -163,13 +163,11 @@ class WordIterator extends IndexBundleIterator
$word_key = base64_decode($hash);
}
-
if($filter != NULL) {
$this->filter = & $filter;
} else {
$this->filter = NULL;
}
-
$this->word_key = $word_key;
$this->feed_shard_name = WORK_DIRECTORY."/feeds/index";
diff --git a/lib/indexing_plugins/recipe_plugin.php b/lib/indexing_plugins/recipe_plugin.php
index b7c354434..db8ecd0a4 100644
--- a/lib/indexing_plugins/recipe_plugin.php
+++ b/lib/indexing_plugins/recipe_plugin.php
@@ -116,7 +116,8 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants
$xpath = new DOMXPath($dom);
$recipes_per_page = $xpath->evaluate(
- /*allr, f.com, brec, fnet*/ "/html//ul[@class = 'ingredient-wrap'] |
+ /*allr, f.com, brec, fnet*/
+ "/html//ul[@class = 'ingredient-wrap'] |
/html//*[@class = 'pod ingredients'] |
/html//*[@id='recipe_title'] |
/html//div[@class = 'rcp-head clrfix']|
@@ -126,14 +127,16 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants
if(is_object($recipes_per_page) && $recipes_per_page->length != 0) {
$recipes_count = $recipes_per_page->length;
$titles = $xpath->evaluate(
- /* allr, f.com, brec, fnet */ "/html//*[@id = 'itemTitle']|
+ /* allr, f.com, brec, fnet */
+ "/html//*[@id = 'itemTitle']|
/html//h1[@class = 'fn'] |
/html//*[@id='recipe_title'] |
/html//div[@class ='rcp-head clrfix']/h1 |
/html//h1[@class = 'fn recipeDetailHeading']");
for($i=0; $i < $recipes_count; $i++) {
$ingredients = $xpath->evaluate(
- /*allr*, fcomm, brec, fnet*/ "/html//ul[@class = 'ingredient-wrap']/li |
+ /*allr*, fcomm, brec, fnet*/
+ "/html//ul[@class = 'ingredient-wrap']/li |
/html//li[@class = 'ingredient']|
/html//*[@class = 'ingredients']/*|
/html//*[@itemprop='ingredients']
@@ -185,7 +188,7 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants
$more_docs = true;
$raw_recipes = array();
$limit = 0;
- $num = 10;
+ $num = 100;
while($more_docs) {
$results = @$search_controller->queryRequest($query,
$num, $limit, 1, $index_name);
@@ -327,7 +330,7 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants
$clusters = kruskalClustering($weights,
$count, $distinct_ingredients);
$index_shard = new IndexShard("cluster_shard");
- $word_counts = array();
+ $word_lists = array();
$recipe_sites = array();
foreach($clusters as $cluster) {
@@ -336,7 +339,6 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants
$meta_ids = array();
$summary = array();
$recipe = $cluster[$i];
- $doc_key = $doc_keys[$recipe];
$summary[self::URL] =
$recipes_summary[$recipe][self::URL];
$summary[self::TITLE] =
@@ -349,26 +351,42 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants
$recipes_summary[$recipe][self::ENCODING];
$summary[self::HASH] =
$recipes_summary[$recipe][self::HASH];
+ $doc_keys[$recipe] =
+ crawlHash($summary[self::URL], true);
+ $hash_rhost = "r". substr(crawlHash( // r is for recipe
+ UrlParser::getHost($summary[self::URL])."/",true), 1);
+ $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
$summary[self::TYPE] =
$recipes_summary[$recipe][self::TYPE];
$summary[self::HTTP_CODE] =
$recipes_summary[$recipe][self::HTTP_CODE];
$recipe_sites[] = $summary;
- $meta_ids[] = "ingredient:".$cluster["ingredient"];
- $index_shard->addDocumentWords($doc_key,
+ $meta_ids[] = "ingredient:".trim($cluster["ingredient"]);
+ crawlLog("ingredient:".$cluster["ingredient"]);
+ if(!$index_shard->addDocumentWords($doc_keys[$recipe],
self::NEEDS_OFFSET_FLAG,
- $word_counts, $meta_ids, true, false);
- $index_shard->save(true);
+ $word_lists, $meta_ids, true, false)) {
+ crawlLog("Problem inserting recipe: ".
+ $summary[self::TITLE]);
+ }
}
}
+ $shard_string = $index_shard->save(true);
+ $index_shard = IndexShard::load("cluster_shard",
+ $shard_string);
+ unset($shard_string);
crawlLog("...Adding recipe shard to index archive bundle");
$dir = CRAWL_DIR."/cache/".self::index_data_base_name.$index_name;
$index_archive = new IndexArchiveBundle($dir, false);
+ if($index_shard->word_docs_packed) {
+ $index_shard->unpackWordDocs();
+ }
$generation = $index_archive->initGenerationToAdd($index_shard);
if(isset($recipe_sites)) {
+ crawlLog("... Adding ".count($recipe_sites)." recipe docs.");
$index_archive->addPages($generation,
self::SUMMARY_OFFSET, $recipe_sites, 0);
}
@@ -379,8 +397,7 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants
$site[self::HASH] .
"r". substr(crawlHash( // r is for recipe
UrlParser::getHost($site[self::URL])."/",true), 1);
- $summary_offsets[$hash] =
- array($site[self::SUMMARY_OFFSET], null);
+ $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
}
$index_shard->changeDocumentOffsets($summary_offsets);
$index_archive->addIndexData($index_shard);