diff --git a/src/configs/Config.php b/src/configs/Config.php index 8361e7378..1f3c481e5 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -888,9 +888,9 @@ nsdefine('QUERY_IMPRESSION', 4); */ nsconddefine("ANALYTICS_UPDATE_INTERVAL", ONE_HOUR / 6); /** Value of epsilon in differential privacy formula */ -nsdefine('PRIVACY_EPSILON', 0.01); +nsconddefine('PRIVACY_EPSILON', 0.01); /** Flag to turn on/off differential privacy */ -nsdefine('DIFFERENTIAL_PRIVACY', false); +nsconddefine('DIFFERENTIAL_PRIVACY', false); /* * Database Field Sizes */ @@ -937,4 +937,4 @@ nsconddefine('AD_DATE_FORMAT','Y-m-d'); /** advertisement logo*/ nsconddefine('AD_LOGO','resources/adv-logo.png'); /** sentence compression enabled or not*/ -nsconddefine('SENTENCE_COMPRESSION_ENABLED', false); \ No newline at end of file +nsconddefine('SENTENCE_COMPRESSION_ENABLED', false); diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php index 68846421b..5b33af689 100755 --- a/src/controllers/SearchController.php +++ b/src/controllers/SearchController.php @@ -824,7 +824,6 @@ class SearchController extends Controller implements CrawlConstants $advertisement_model->addImpression( $data['RELEVANT_ADVERTISEMENT']['ID']); } - } } break; @@ -1165,9 +1164,10 @@ class SearchController extends Controller implements CrawlConstants public function queryRequest($query, $results_per_page, $limit = 0, $grouping = 0, $save_timestamp = 0, $limit_feeds = true) { - if (!C\API_ACCESS) {return null; } + if (!C\API_ACCESS) { + return null; + } $grouping = ($grouping > 0 ) ? 
2 : 0; - $data = []; $this->processQuery($data, $query, "query", "", $results_per_page, $limit, 0, $grouping, $save_timestamp, $limit_feeds); diff --git a/src/examples/SearchApi.php b/src/examples/SearchApi.php index c8b1ba860..4884d2823 100644 --- a/src/examples/SearchApi.php +++ b/src/examples/SearchApi.php @@ -50,9 +50,7 @@ define("seekquarry\\yioop\\configs\\PARENT_DIR", define("seekquarry\\yioop\\configs\\BASE_DIR", C\PARENT_DIR . "/src"); /** Load in global configuration settings; you need this*/ require_once C\BASE_DIR.'/configs/Config.php'; -/** Load class autoloader*/ -require_once C\PARENT_DIR.'/vendor/autoload.php'; -if (!PROFILE) { +if (!C\PROFILE) { echo "Please configure the search engine instance by visiting" . "its web interface on localhost.\n"; exit(); diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index 9ad57747b..0ba40bd69 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -586,7 +586,6 @@ class ArcTool implements CrawlConstants } else { echo "\n$path ...\n". 
" does not contain posting shards so cannot be re-indexed\n\n"; - } } /** @@ -1114,4 +1113,4 @@ EOD; } $arc_tool = new ArcTool(); -$arc_tool->start(); \ No newline at end of file +$arc_tool->start(); diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 0c3d1e198..9828385d0 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -1119,12 +1119,9 @@ class Fetcher implements CrawlConstants L\crawlLog("So not checking scheduler."); return true; } - $this->selectCurrentServerAndUpdateIfNeeded(false); - $this->recrawl_check_scheduler = false; $queue_server = $this->queue_servers[$this->current_server]; - L\crawlLog("Checking $queue_server for a new schedule."); // hosts with error counts cleared with each schedule $this->hosts_with_errors = []; @@ -1427,7 +1424,7 @@ class Fetcher implements CrawlConstants $this->plugin_processors = []; foreach ($info[self::INDEXING_PLUGINS] as $plugin) { if ($plugin == "") { continue; } - $plugin_name = C\NS_PLUGINS . $plugin."Plugin"; + $plugin_name = C\NS_PLUGINS . $plugin . 
"Plugin"; $processors = $plugin_name::getProcessors(); $plugin_object = new $plugin_name(); if (method_exists($plugin_name, "setConfiguration") && @@ -1874,7 +1871,6 @@ class Fetcher implements CrawlConstants } $this->copySiteFields($i, $site, $summarized_site_pages, $stored_site_pages); - $summarized_site_pages[$i][self::URL] = strip_tags($site[self::URL]); if (isset($site[self::REPOSITORY_TYPE]) && @@ -1980,6 +1976,9 @@ class Fetcher implements CrawlConstants $summarized_site_pages[$i][self::INDEX] = $num_items + $i; } foreach ($filter_stored as $stored) { + if (!isset($stored[self::INDEX]) ) { + continue; + } $i = $stored[self::INDEX]; if (isset($stored[self::OFFSET])) { $summarized_site_pages[$i][self::OFFSET] = diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index cdf8e649e..39adc724a 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -1089,7 +1089,6 @@ class QueueServer implements CrawlConstants, Join { //to get here we at least have to have a crawl_time $this->crawl_time = $info[self::CRAWL_TIME]; - $read_from_info = [ "crawl_order" => self::CRAWL_ORDER, "crawl_type" => self::CRAWL_TYPE, diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php index 745f5de6b..dca63de06 100755 --- a/src/library/LocaleFunctions.php +++ b/src/library/LocaleFunctions.php @@ -78,7 +78,7 @@ function guessLocale() ((isset($_SESSION['l'])) ? $_SESSION['l'] : $guess_l); if (strlen($l) < 10) { $l = addslashes($l); - if (is_dir(C\LOCALE_DIR."/". str_replace("-", "_", $l))) { + if (is_dir(C\LOCALE_DIR . "/" . 
str_replace("-", "_", $l))) { $locale_tag = $l; } } @@ -480,4 +480,4 @@ function formatDateByLocale($timestamp, $locale_tag) default: return date("F d Y H:i", intval($timestamp)); } -} \ No newline at end of file +} diff --git a/src/library/Utility.php b/src/library/Utility.php index fa10b1df4..fd53acd5b 100755 --- a/src/library/Utility.php +++ b/src/library/Utility.php @@ -1185,9 +1185,11 @@ function findMaterialMetas($metas, $encode_metas) */ function encodeMaterialMetas($metas, $encode_metas) { + if (!is_array($encode_metas) || empty($encode_metas)) { + return ""; + } $found_materialized_metas = findMaterialMetas($metas, $encode_metas); $meta_string = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"; - if (!is_array($encode_metas)) return ""; foreach ($found_materialized_metas as $name => $values) { foreach ($values as $value) { if ($name == 'class:' && isset($value[6])) { diff --git a/src/library/indexing_plugins/RecipePlugin.php b/src/library/indexing_plugins/RecipePlugin.php index 38cb454c4..68ddcce11 100644 --- a/src/library/indexing_plugins/RecipePlugin.php +++ b/src/library/indexing_plugins/RecipePlugin.php @@ -3,7 +3,7 @@ * SeekQuarry/Yioop -- * Open Source Pure PHP Search Engine, Crawler, and Indexer * - * Copyright (C) 2011 - 2014 Priya Gangaraju priya.gangaraju@gmail.com, + * Copyright (C) 2011 - 2017 Priya Gangaraju priya.gangaraju@gmail.com, * Chris Pollett, chris@pollett.org * * LICENSE: @@ -27,7 +27,7 @@ * chris@pollett.org * @license http://www.gnu.org/licenses/ GPL3 * @link http://www.seekquarry.com/ - * @copyright 2011 -2014 + * @copyright 2011 -2017 * @filesource */ namespace seekquarry\yioop\library\indexing_plugins; @@ -37,8 +37,10 @@ use seekquarry\yioop\controllers\SearchController; use seekquarry\yioop\library as L; use seekquarry\yioop\library\CrawlConstants; use seekquarry\yioop\library\IndexShard; +use seekquarry\yioop\library\IndexArchiveBundle; use seekquarry\yioop\library\PhraseParser; use 
seekquarry\yioop\library\processors\HtmlProcessor; +use seekquarry\yioop\library\UrlParser; /** Don't try to use file cache either*/ if (!C\nsdefined("USE_CACHE")) { @@ -68,10 +70,35 @@ require_once C\BASE_DIR . "/library/LocaleFunctions.php"; */ class RecipePlugin extends IndexingPlugin implements CrawlConstants { + /** + * Ingredients that are common to many recipes so unlikely to be the main + * ingredient for a recipe + * @var array + */ + public static $basic_ingredients = [ + 'onion','oil','cheese','pepper','sauce', + 'salt','milk','butter','flour','cake', + 'garlic','cream','soda','honey','powder', + 'sauce','water','vanilla','pepper','bread', + 'sugar','vanillaextract','celery', + 'seasoning','syrup','skewers','egg', + 'muffin','ginger','basil','oregano', + 'cinammon','cumin','mayonnaise','mayo', + 'chillipowder','lemon','greens','yogurt', + 'margarine','asparagus','halfhalf', + 'pancakemix','coffee','cookies','lime', + 'chillies','cilantro','rosemary', + 'vanillaextract','vinegar','shallots', + 'wine','cornmeal','nonstickspray']; /** * Ratio of clusters/total number of recipes seen */ const CLUSTER_RATIO = 0.1; + /** + * Number of recipes to put into a shard before switching shards while + * clustering + */ + const NUM_RECIPES_PER_SHARD = 1000; /** * This method is called by a PageProcessor in its handle() method * just after it has processed a web page. 
This method allows @@ -103,31 +130,25 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants //detect recipes $recipes_per_page = $xpath->evaluate( /*allr, f.com, brec, fnet*/ - "/html//ul[@class = 'ingredient-wrap']| - /html//*[@class = 'pod ingredients'] | - /html//*[@itemtype='http://data-vocabulary.org/Recipe']| - /html//div[@class = 'rcp-head clrfix']| - /html//h1[@class = 'fn recipeDetailHeading']"); + "/html//header[@class='recipe']/h1| + /html//*[@id='recipe_title']| + /html//*[@itemtype='http://schema.org/Recipe']"); $recipe = []; $subdocs_description = []; if (is_object($recipes_per_page) && $recipes_per_page->length != 0) { $recipes_count = $recipes_per_page->length; $titles = $xpath->evaluate( /* allr, f.com, brec, fnet */ - "/html//*[@id = 'itemTitle']| - /html//h1[@class = 'fn']| - /html//*[@itemprop='name']| - /html//div[@class ='rcp-head clrfix']/h1 | - /html//h1[@class = 'fn recipeDetailHeading']"); - for ($i=0; $i < $recipes_count; $i++) { + "/html//*[@id = 'recipe_title']| + /html//header[@class = 'recipe']/h1| + /html//*[@itemprop='name']"); + for ($i = 0; $i < $recipes_count; $i++) { $ingredients = $xpath->evaluate( /*allr*, fcomm, brec, fnet*/ "/html//ul[@class = 'ingredient-wrap']/li | - /html//li[@class = 'ingredient']| - /html//*[@class = 'ingredients']/*| - /html//*[itemtype=". 
- "'http://data-vocabulary.org/RecipeIngredient'] - "); + /html//li[@data-ingredient]| + /html//*[@itemprop ='ingredient']| + /html//*[@itemprop='ingredients']"); $ingredients_result = ""; if (is_object($ingredients) && $ingredients->length != 0){ $lastIngredient = end($ingredients); @@ -166,8 +187,6 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants L\crawlLog("...Aborting plugin"); return; } - $locale_tag = L\guessLocale(); - L\setLocaleObject($locale_tag); $search_controller = new SearchController(); $query = "recipe:all i:$index_name"; L\crawlLog("...Running Recipe Plugin!"); @@ -175,216 +194,241 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants $more_docs = true; $raw_recipes = []; $limit = 0; + $up_to_last_shard_num = 0; $num = 100; + $added_recipes = false; + restore_error_handler(); while($more_docs) { - $results = @$search_controller->queryRequest($query, - $num, $limit, 1, $index_name); - if (isset($results["PAGES"]) && - ($num_results = count($results["PAGES"])) > 0 ) { - $raw_recipes = array_merge($raw_recipes, $results["PAGES"]); - } - L\crawlLog("Scanning recipes $limit through ". - ($limit + $num_results)."."); - $limit += $num_results; - if (isset($results["SAVE_POINT"]) ){ - $end = true; - foreach ($results["SAVE_POINT"] as $save_point) { - if ($save_point != -1) { - $end = false; - } + $added_recipes_round = false; + while($more_docs || + $limit > $up_to_last_shard_num + self::NUM_RECIPES_PER_SHARD) { + $results = @$search_controller->queryRequest($query, + $num, $limit, 1, $index_name); + if (isset($results["PAGES"]) && + ($num_results = count($results["PAGES"])) > 0 ) { + $raw_recipes = array_merge($raw_recipes, $results["PAGES"]); } - if ($end) { + L\crawlLog("Scanning recipes $limit through ". 
+ ($limit + $num_results)."."); + $limit += $num_results; + if (isset($results["SAVE_POINT"]) ){ + $end = true; + foreach ($results["SAVE_POINT"] as $save_point) { + if ($save_point != -1) { + $end = false; + } + } + if ($end) { + $more_docs = false; + } + } else { $more_docs = false; } - } else { - $more_docs = false; } - } - L\crawlLog("...Clustering."); - // only cluster if would make more than one cluster - if (count($raw_recipes) * self::CLUSTER_RATIO > 1 ) { - $recipes = []; - $i = 0; - foreach ($raw_recipes as $raw_recipe) { - $description = $raw_recipe[self::DESCRIPTION]; - $ingredients = explode("||", $description); - if (is_array($ingredients) && count($ingredients) > 1) { - $recipes[$i][0]= $raw_recipe[self::TITLE]; - $recipes[$i][1] = $ingredients; - $recipes[$i][2] = L\crawlHash($raw_recipe[self::URL]); - $recipes[$i][3] = $raw_recipe; - $i++; + set_error_handler(C\NS_LIB . "yioop_error_handler"); + L\crawlLog("...Clustering from $up_to_last_shard_num to "); + $up_to_last_shard_num += self::NUM_RECIPES_PER_SHARD; + L\crawlLog("...to $up_to_last_shard_num"); + // only cluster if would make more than one cluster + if (count($raw_recipes) * self::CLUSTER_RATIO > 1 ) { + $recipes = []; + $i = 0; + foreach ($raw_recipes as $raw_recipe) { + $ingredients = explode("||", + $raw_recipe[self::DESCRIPTION]); + if (is_array($ingredients) && count($ingredients) > 1) { + $recipes[$i][0]= $raw_recipe[self::TITLE]; + $recipes[$i][1] = $ingredients; + $recipes[$i][2] = L\crawlHash($raw_recipe[self::URL]); + $recipes[$i][3] = $raw_recipe; + $i++; + } } - } - $recipes_ingredients = []; - $count = count($recipes); - foreach ($recipes as $key => $recipe) { - foreach ($recipe[1] as $index => $ingredient) { - if (strlen($ingredient) != 0 && ( - substr($ingredient, - strlen($ingredient) - 1) != ":")) { - $mainIngredient = - $this->getIngredientName((string)$ingredient); - if (strlen($mainIngredient) != 0) { - $recipe[1][$index] = $mainIngredient; + $recipes_ingredients = []; 
+ $count = count($recipes); + foreach ($recipes as $key => $recipe) { + foreach ($recipe[1] as $index => $ingredient) { + if (strlen($ingredient) != 0 && ( + substr($ingredient, + strlen($ingredient) - 1) != ":")) { + $main_ingredient = + $this->getIngredientName((string)$ingredient); + if (strlen($main_ingredient) != 0) { + $recipe[1][$index] = $main_ingredient; + } else { + unset($recipe[1][$index]); + } } else { unset($recipe[1][$index]); } - } else { - unset($recipe[1][$index]); } + $recipes[$key] = $recipe; } - $recipes[$key] = $recipe; - } - $count = count($recipes); - $k = 0; - $basic_ingredients = [ - 'onion','oil','cheese','pepper','sauce', - 'salt','milk','butter','flour','cake', - 'garlic','cream','soda','honey','powder', - 'sauce','water','vanilla','pepper','bread', - 'sugar','vanillaextract','celery', - 'seasoning','syrup','skewers','egg', - 'muffin','ginger','basil','oregano', - 'cinammon','cumin','mayonnaise','mayo', - 'chillipowder','lemon','greens','yogurt', - 'margarine','asparagus','halfhalf', - 'pancakemix','coffee','cookies','lime', - 'chillies','cilantro','rosemary', - 'vanillaextract','vinegar','shallots', - 'wine','cornmeal','nonstickspray']; - for ($i = 0; $i < $count; $i++) { - $recipe1_main_ingredient = ""; - $recipe1 = $recipes[$i][1]; - $recipe_name = $recipes[$i][0]; - $recipe1_title = strtolower($recipes[$i][0]); - $distinct_ingredients[$recipe_name] = $recipes[$i][1]; - $doc_keys[$recipe_name] = $recipes[$i][2]; - $recipes_summary[$recipe_name] = $recipes[$i][3]; - for ($j = $i + 1; $j < $count; $j++) { - $recipe2_main_ingredient = ""; - $recipe2 = $recipes[$j][1]; - $recipe2_title = strtolower($recipes[$j][0]); - $weights[$k][0] = $recipes[$i][0]; - $weights[$k][1] = $recipes[$j][0]; - $merge_array = array_merge($recipe1, $recipe2); - $vector_array = array_unique($merge_array); - sort($vector_array); - $recipe1_vector = array_fill_keys($vector_array, 0); - $recipe2_vector = array_fill_keys($vector_array, 0); + $num_recipes = 
count($recipes); + $k = 0; + $weights = []; + $distinct_ingredients = []; + for ($i = 0; $i < $num_recipes; $i++) { + $recipe1_main_ingredient = ""; + // recipe1 is an array of ingredients + $recipe1 = $recipes[$i][1]; + $recipe_name = $recipes[$i][0]; + $recipe1_title = strtolower($recipes[$i][0]); + $distinct_ingredients[$recipe_name] = $recipes[$i][1]; + $doc_keys[$recipe_name] = $recipes[$i][2]; + $recipes_summary[$recipe_name] = $recipes[$i][3]; foreach ($recipe1 as $ingredient){ - if ($ingredient != "" && - !in_array($ingredient, $basic_ingredients)) { + if ($ingredient != "" && !in_array($ingredient, + RecipePlugin::$basic_ingredients)) { if (strstr($recipe1_title, $ingredient)) { $recipe1_main_ingredient = $ingredient; + break; } } - $recipe1_vector[$ingredient] = 1; } - foreach ($recipe2 as $ingredient) { - if ($ingredient != ""&& ! - in_array($ingredient, $basic_ingredients)) { - if (strstr($recipe2_title, $ingredient)) { - $recipe2_main_ingredient = $ingredient; - } + for ($j = $i + 1; $j < $num_recipes; $j++) { + $recipe2_main_ingredient = ""; + // recipe2 is an array of ingredients + $recipe2 = $recipes[$j][1]; + $recipe2_title = strtolower($recipes[$j][0]); + $weights[$k][0] = $recipes[$i][0]; + $weights[$k][1] = $recipes[$j][0]; + $merge_array = array_merge($recipe1, $recipe2); + $vector_array = array_unique($merge_array); + sort($vector_array); + $recipe1_vector = array_fill_keys($vector_array, 0); + $recipe2_vector = array_fill_keys($vector_array, 0); + foreach ($recipe1 as $ingredient) { + $recipe1_vector[$ingredient] = 1; } - $recipe2_vector[$ingredient] = 1; + $main_ingredient_found = false; + foreach ($recipe2 as $ingredient) { + if (!$main_ingredient_found && $ingredient != "" && + !in_array($ingredient, + RecipePlugin::$basic_ingredients)) { + if (strstr($recipe2_title, $ingredient)) { + $recipe2_main_ingredient = $ingredient; + $main_ingredient_found = true; + } + } + $recipe2_vector[$ingredient] = 1; + } + $edge_weight = 0; + $matches = 1; 
+ foreach ($vector_array as $vector) { + $diff = $recipe1_vector[$vector] - + $recipe2_vector[$vector]; + $vector_diff[$vector] = (pow($diff, 2)); + if (abs($diff) == 1) + $matches += 1; + $edge_weight += $vector_diff[$vector]; + } + $main_ingredient_match = 1; + /* recipes that share main ingredients should have + heavier weighted edges between them + */ + if ($recipe1_main_ingredient == + $recipe2_main_ingredient) { + $main_ingredient_match = 1000; + } + $edge_weight = sqrt($edge_weight) * + $matches * $main_ingredient_match; + $weights[$k][2] = $edge_weight; + $k++; } - $edge_weight = 0; - $matches = 1; - foreach ($vector_array as $vector) { - $diff = $recipe1_vector[$vector] - - $recipe2_vector[$vector]; - $vector_diff[$vector] = (pow($diff, 2)); - if (abs($diff) == 1) - $matches += 1; - $edge_weight += $vector_diff[$vector]; + } + /* $weights at this point is an array + [vertex_0, vertex_1, edge_weight] + */ + $clusters = $this->getClusters($weights, $distinct_ingredients); + L\crawlLog("...Making new shard with clustered ". + "recipes as docs."); + $index_shard = new IndexShard("cluster_shard"); + $word_lists = []; + $recipe_sites = []; + foreach ($clusters as $cluster) { + $num_recipes_cluster = count($cluster); + for ($i = 0; $i < $num_recipes_cluster - 1; $i++) { + $meta_ids = []; + $summary = []; + $recipe = $cluster[$i]; + $summary[self::URL] = + $recipes_summary[$recipe][self::URL]; + $summary[self::TITLE] = + $recipes_summary[$recipe][self::TITLE]; + $summary[self::DESCRIPTION] = + $recipes_summary[$recipe][self::DESCRIPTION]; + $summary[self::TIMESTAMP] = + $recipes_summary[$recipe][self::TIMESTAMP]; + $summary[self::ENCODING] = + $recipes_summary[$recipe][self::ENCODING]; + $summary[self::HASH] = + $recipes_summary[$recipe][self::HASH]; + $doc_keys[$recipe] = + L\crawlHash($summary[self::URL], true); + $hash_rhost = "r". 
substr(L\crawlHash(//r is for recipe + UrlParser::getHost( + $summary[self::URL])."/", true), 1); + $doc_keys[$recipe] .= $summary[self::HASH] .$hash_rhost; + $summary[self::TYPE] = + $recipes_summary[$recipe][self::TYPE]; + $summary[self::HTTP_CODE] = + $recipes_summary[$recipe][self::HTTP_CODE]; + $recipe_sites[] = $summary; + $meta_ids[] = "ingredient:" . + trim($cluster["ingredient"]); + L\crawlLog("ingredient:" . $cluster["ingredient"]); + if (!$index_shard->addDocumentWords($doc_keys[$recipe], + self::NEEDS_OFFSET_FLAG, + $word_lists, $meta_ids, true, false)) { + L\crawlLog("Problem inserting recipe: ". + $summary[self::TITLE]); + } } - $main_ingredient_match = 1; - if ($recipe1_main_ingredient != $recipe2_main_ingredient) - $main_ingredient_match = 1000; - $edge_weight = sqrt($edge_weight) * - $matches * $main_ingredient_match; - $weights[$k][2] = $edge_weight; - $k++; } - } - L\crawlLog("...Making new shard with clustered recipes as docs."); - $clusters = kruskalClustering($weights, - $count, $distinct_ingredients); - $index_shard = new IndexShard("cluster_shard"); - $word_lists = []; - $recipe_sites = []; - foreach ($clusters as $cluster) { - $count = count($cluster); - for ($i = 0; $i < $count - 1; $i++) { - $meta_ids = []; - $summary = []; - $recipe = $cluster[$i]; - $summary[self::URL] = - $recipes_summary[$recipe][self::URL]; - $summary[self::TITLE] = - $recipes_summary[$recipe][self::TITLE]; - $summary[self::DESCRIPTION] = - $recipes_summary[$recipe][self::DESCRIPTION]; - $summary[self::TIMESTAMP] = - $recipes_summary[$recipe][self::TIMESTAMP]; - $summary[self::ENCODING] = - $recipes_summary[$recipe][self::ENCODING]; - $summary[self::HASH] = - $recipes_summary[$recipe][self::HASH]; - $doc_keys[$recipe] = - L\crawlHash($summary[self::URL], true); - $hash_rhost = "r". substr(L\crawlHash( // r is for recipe - UrlParser::getHost($summary[self::URL])."/",true), 1); - $doc_keys[$recipe] .= $summary[self::HASH] . 
$hash_rhost; - $summary[self::TYPE] = - $recipes_summary[$recipe][self::TYPE]; - $summary[self::HTTP_CODE] = - $recipes_summary[$recipe][self::HTTP_CODE]; - $recipe_sites[] = $summary; - $meta_ids[] = "ingredient:".trim($cluster["ingredient"]); - L\crawlLog("ingredient:".$cluster["ingredient"]); - if (!$index_shard->addDocumentWords($doc_keys[$recipe], - self::NEEDS_OFFSET_FLAG, - $word_lists, $meta_ids, true, false)) { - L\crawlLog("Problem inserting recipe: ". - $summary[self::TITLE]); + $shard_string = $index_shard->save(true); + $index_shard = IndexShard::load("cluster_shard", + $shard_string); + unset($shard_string); + L\crawlLog("...Adding recipe shard to index archive bundle"); + $dir = C\CRAWL_DIR . "/cache/" . self::index_data_base_name . + $index_name; + if (empty($index_archive)) { + $index_archive = new IndexArchiveBundle($dir, false); + } + if ($index_shard->word_docs_packed) { + $index_shard->unpackWordDocs(); + } + if (!empty($recipe_sites)) { + $generation = $index_archive->initGenerationToAdd( + count($recipe_sites)); + L\crawlLog("... Adding ".count($recipe_sites). + " recipe docs."); + $index_archive->addPages($generation, + self::SUMMARY_OFFSET, $recipe_sites, 0); + $k = 0; + foreach ($recipe_sites as $site) { + $recipe = $site[self::TITLE]; + $hash = L\crawlHash($site[self::URL], true). + $site[self::HASH] . + "r". 
substr(L\crawlHash( // r is for recipe + UrlParser::getHost($site[self::URL])."/",true), 1); + $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET]; } + $index_shard->changeDocumentOffsets($summary_offsets); + $index_archive->addIndexData($index_shard); + $added_recipes_round = true; + $added_recipes = true; } } - $shard_string = $index_shard->save(true); - $index_shard = IndexShard::load("cluster_shard", - $shard_string); - unset($shard_string); - L\crawlLog("...Adding recipe shard to index archive bundle"); - $dir = C\CRAWL_DIR."/cache/".self::index_data_base_name.$index_name; - $index_archive = new IndexArchiveBundle($dir, false); - if ($index_shard->word_docs_packed) { - $index_shard->unpackWordDocs(); - } - $generation = $index_archive->initGenerationToAdd($index_shard); - if (isset($recipe_sites)) { - L\crawlLog("... Adding ".count($recipe_sites)." recipe docs."); - $index_archive->addPages($generation, - self::SUMMARY_OFFSET, $recipe_sites, 0); + if ($added_recipes) { + $index_archive->forceSave(); + $index_archive->addAdvanceGeneration(); + $index_archive->dictionary->mergeAllTiers(); + $this->db->setWorldPermissionsRecursive( + C\CRAWL_DIR.'/cache/'. + self::index_data_base_name.$index_name); } - $k = 0; - foreach ($recipe_sites as $site) { - $recipe = $site[self::TITLE]; - $hash = L\crawlHash($site[self::URL], true). - $site[self::HASH] . - "r". substr(L\crawlHash( // r is for recipe - UrlParser::getHost($site[self::URL])."/",true), 1); - $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET]; - } - $index_shard->changeDocumentOffsets($summary_offsets); - $index_archive->addIndexData($index_shard); - $index_archive->saveAndAddCurrentShardDictionary(); - $index_archive->dictionary->mergeAllTiers(); - $this->db->setWorldPermissionsRecursive( - C\CRAWL_DIR.'/cache/'. 
- self::index_data_base_name.$index_name); L\crawlLog("...Recipe plugin finished."); } } @@ -459,6 +503,82 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants $name = preg_replace('/[^a-zA-Z]/', "", $name); return $name; } + /** + * Creates tree from the input and apply Kruskal's algorithm to find minimal. + * spanning tree + * + * @param array $edges elements of form (recipe_1_title, recipe_2_title, weight) + * @return array $min_edges just those edges from the original edgest needed to + * make a minimal spanning + */ + private function extractMinimalSpanningTreeEdges($edges) + { + $vertices = []; + $tree_heap = new MinWeightedEdgeHeap(); + $vertice_no = 1; + for ($i = 0; $i < count($edges) - 1; $i++) { + $edge1 = new WeightedEdge($edges[$i][0], $edges[$i][1], + $edges[$i][2]); + $tree_heap->insert($edge1); + $vertex1 = $edge1->getStartVertex(); + $vertex2 = $edge1->getEndVertex(); + if (empty($vertices[$vertex1->getLabel()])){ + $vertices[$vertex1->getLabel()] = $vertice_no; + $vertice_no++; + } + if (empty($vertices[$vertex2->getLabel()])) { + $vertices[$vertex2->getLabel()] = $vertice_no; + $vertice_no++; + } + } + $k = 0; + $tree_heap->top(); + while($k < count($vertices) - 1) { + $min_edge = $tree_heap->extract(); + $vertex1= $min_edge->getStartVertex()->getLabel(); + $vertex2 = $min_edge->getEndVertex()->getLabel(); + if ($vertices[$vertex1] != $vertices[$vertex2]){ + if ($vertices[$vertex1] < $vertices[$vertex2]){ + $m = $vertices[$vertex2]; + $n = $vertices[$vertex1]; + } else { + $m = $vertices[$vertex1]; + $n = $vertices[$vertex2]; + } + foreach ($vertices as $vertex => $no){ + if ($no == $m) { + $vertices[$vertex] = $n; + } + } + $min_edges[] = $min_edge; + $k++; + } + } + return $min_edges; + } + /** + * Clusters the recipes from an array recipe adjacency weights. 
+ * + * @param array $recipe_adjacency_weights array of triples + * (recipe_1_title, recipe_2_title, weight) + * @param array $distinct_ingredients list of possible ingredients + * @return array list of clusters of recipes. This array will have + * total_number_of_recipes * self::CLUSTER_RATIO many clusters. + * Each cluster will contain an ingredient field with the most common + * non basic ingredient found in the recipes of that cluster. + */ + private function getClusters($recipe_adjacency_weights, + $distinct_ingredients) + { + $minimal_spanning_tree_edges = $this->extractMinimalSpanningTreeEdges( + $recipe_adjacency_weights); + $recipe_clusterer = new RecipeClusterer($minimal_spanning_tree_edges); + $clusters = $recipe_clusterer->makeClusters(); + $clusters_with_ingredient_label = + $recipe_clusterer->addMostCommonIngredientClusters($clusters, + $distinct_ingredients); + return $clusters_with_ingredient_label; + } /** * Which mime type page processors this plugin should do additional * processing for @@ -484,7 +604,7 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants } } /** - * Vertex class for Recipe Clustering Minimal Spanning Tree + * Vertex class for used for Recipe Clustering */ class Vertex { @@ -534,9 +654,9 @@ class Vertex } } /** - * Directed Edge class for Recipe Clustering Minimal Spanning Tree + * Directed Edge class for Recipe Clustering */ -class Edge +class WeightedEdge { /** * Starting vertex of the directed edge this object represents @@ -597,11 +717,11 @@ class Edge * deleting the most expensive edge. BreadthFirstSearch is used to * traverse the MST. 
*/ -class Tree +class RecipeClusterer { /** * Maintains a priority queue of edges ordered by max weight - * @var Cluster + * @var MaxWeightedEdgeHeap */ public $cluster_heap; /** @@ -617,19 +737,12 @@ class Tree /** * Constructs a tree suitable for building containing a Minimal Spanning * Tree for Kruskal clustering + * @param array $edges vertices and edge weights of MST */ - public function __construct() + public function __construct($edges) { - $this->cluster_heap = new Cluster(); + $this->cluster_heap = new MaxWeightedEdgeHeap(); $this->vertices = []; - } - /** - * Constructs the adjacency matrix for the MST. - * - * @param array $edges vertices and edge weights of MST - */ - public function constructMST($edges) - { foreach ($edges as $edge) { $this->cluster_heap->insert($edge); $vertex1 = $edge->getStartVertex(); @@ -644,55 +757,61 @@ class Tree $this->vertices[$vertex2->getLabel()] = $vertex2; } } - /** - * Forms the clusters by removing maximum weighted edges. - * performs breadth-first search to cluster the recipes. - * - * @param int $k queue size - * @param int $size number of recipes. - * @return array $cluster clusters of recipes. - */ - public function formCluster($k, $size) + /** + * Forms the clusters from $this->cluster_heap by removing maximum weighted + * edges. $this->cluster_heap is initially a weighted tree as edges are + * removed it becomes a forest. Once the number of trees in the forest + * reaches $num_recipes * RecipePlugin::CLUSTER_RATIO, the trees are treated + * as the clusters to be returned. 
+ * + * @return array $clusters each element of $clusters is an array of + * recipe names + */ + public function makeClusters() { $this->cluster_heap->top(); - $nodeQueue = new Queue($k); - $cluster_count = $size * self::CLUSTER_RATIO; - $cluster = []; + $num_initial_edges = $this->cluster_heap->count(); + $num_recipes = count($this->vertices); + $node_queue = new Queue($num_initial_edges); + $num_cluster_to_make = $num_recipes * RecipePlugin::CLUSTER_RATIO; + $clusters = []; /* - Idea remove $cluster_count many weightiest edges from tree + Idea remove $num_cluster_to_make many weightiest edges from tree to get a forest. As do this add to queue end points of removed edges. */ - for ($j = 0; $j < $cluster_count - 1; $j++) { + for ($j = 0; $j < $num_cluster_to_make - 1; $j++) { $max_edge = $this->cluster_heap->extract(); $cluster1_start = $max_edge->getStartVertex()->getLabel(); $cluster2_start = $max_edge->getEndVertex()->getLabel(); $this->adjacency_matrix[$cluster1_start][$cluster2_start] = -1; $this->adjacency_matrix[$cluster2_start][$cluster1_start] = -1; - $nodeQueue->enqueue($cluster1_start); - $nodeQueue->enqueue($cluster2_start); + $node_queue->enqueue($cluster1_start); + $node_queue->enqueue($cluster2_start); } - $queue = new Queue($k); + $queue = new Queue($num_initial_edges); $i = 0; // Now use Queue above to make clusters (trees in resulting forest) - while(!$nodeQueue->isEmpty()) { - $node = $nodeQueue->dequeue(); - if ($this->vertices[$node]->isVisited() == false){ + while(!$node_queue->isEmpty()) { + $node = $node_queue->dequeue(); + if ($this->vertices[$node]->isVisited() == false) { $this->vertices[$node]->visited(); - $cluster[$i][] = $this->vertices[$node]->getLabel(); + $clusters[$i][] = $this->vertices[$node]->getLabel(); $queue->enqueue($this->vertices[$node]->getLabel()); while(!$queue->isEmpty()){ $node = $queue->dequeue(); - while(($nextnode = $this->getNextVertex($node)) != -1){ - $this->vertices[$nextnode]->visited(); - $cluster[$i][]= 
$this->vertices[$nextnode]->getLabel(); - $queue->enqueue($this->vertices[$nextnode]->getLabel()); + while(($next_node = $this->getNextVertex($node)) != -1){ + $this->vertices[$next_node]->visited(); + $clusters[$i][]= $this->vertices[ + $next_node]->getLabel(); + $queue->enqueue($this->vertices[ + $next_node]->getLabel()); } } } $i++; } - return $cluster; + return $clusters; } /** * Gets the next vertex from the adjacency matrix for a given vertex @@ -711,25 +830,18 @@ class Tree return -1; } /** - * Finds the common ingredient for each of the clusters. + * Adds to each element of an array of recipe clusters, + * a field ingredient containing the string name of the most common, + * non-basic ingredient found in that cluster. * * @param array $clusters clusters of recipes. * @param array $ingredients array of ingredients of recipes. * @return array $new_clusters clusters with common ingredient appended. */ - public function findCommonIngredient($clusters, $ingredients) + public function addMostCommonIngredientClusters($clusters, $ingredients) { $k =1; $new_clusters = []; - $basic_ingredients = ["onion", "oil", "cheese", "pepper", "sauce", - "salt", "milk", "butter", 'flour', 'cake', 'garlic','cream','soda', - 'honey','powder','sauce','water','vanilla','pepper','bread', - 'sugar','vanillaextract','celery','seasoning','syrup','skewers', - 'egg','muffin','ginger','basil','oregano','cinammon','cumin', - 'mayonnaise','mayo','chillipowder','lemon','greens','yogurt', - 'margarine','asparagus','halfhalf','pancakemix','coffee', - 'cookies','lime','chillies','cilantro','rosemary','vanillaextract', - 'vinegar','shallots','wine','cornmeal','nonstickspray']; foreach ($clusters as $cluster) { $recipes_count = 0; $cluster_recipe_ingredients = []; @@ -737,16 +849,21 @@ class Tree for ($i = 0; $i < count($cluster); $i++){ $recipe_name = $cluster[$i]; $main_ingredients = - array_diff($ingredients[$recipe_name],$basic_ingredients); + array_diff($ingredients[$recipe_name], + 
RecipePlugin::$basic_ingredients); $cluster_recipe_ingredients = array_merge( $cluster_recipe_ingredients, array_unique($main_ingredients)); } - $ingredient_occurrence = + $ingredient_occurrences = array_count_values($cluster_recipe_ingredients); - $max = max($ingredient_occurrence); - foreach ($ingredient_occurrence as $key => $value){ - if ($max == $value && !in_array($key, $basic_ingredients)) { + if (empty($ingredient_occurrences)) { + continue; + } + $max = max($ingredient_occurrences); + foreach ($ingredient_occurrences as $key => $value){ + if ($max == $value && !in_array($key, + RecipePlugin::$basic_ingredients)) { $common_ingredients[] = $key; } } @@ -760,13 +877,13 @@ class Tree } if (class_exists("\SplHeap")) { /** - * Heap to maintain the MST + * Heap used during clustering to select next edge to use to cluster */ - class Cluster extends \SplHeap + class MaxWeightedEdgeHeap extends \SplHeap { /** - * Compares the weights of two edges and returns -1, 0, 1 depending - * on which is the largest first, equal, or second + * Compares edge costs and returns -1, 0, 1 so that the edge with + * maximum cost/weight is at the top of heap * * @param Edge $edge1 first Edge to compare * @param Edge $edge2 second Edge to compare @@ -781,13 +898,13 @@ if (class_exists("\SplHeap")) { } } /** - * Heap to maintain the tree + * Heap used to compute minimal spanning tree */ - class TreeCluster extends \SplHeap + class MinWeightedEdgeHeap extends \SplHeap { /** - * Compares the weights of two edges and returns -1, 0, 1 depending - * on which is the largest first, equal, or second + * Compares edge costs and returns -1, 0, 1 so that the edge with + * minimum cost/weight is at the top of heap * * @param Edge $edge1 first Edge to compare * @param Edge $edge2 second Edge to compare @@ -872,72 +989,3 @@ class Queue } } -/** - * Creates tree from the input and apply Kruskal's algorithm to find MST. - * - * @param array $edges recipes with distances between them. 
- * @return object arrat $min_edges MST - */ -function construct_tree($edges) -{ - $vertices = []; - $tree_heap = new TreeCluster(); - $vertice_no = 1; - for ($i = 0; $i < count($edges) - 1; $i++) { - $edge1 = new Edge($edges[$i][0], $edges[$i][1], $edges[$i][2]); - $tree_heap->insert($edge1); - $vertex1 = $edge1->getStartVertex(); - $vertex2 = $edge1->getEndVertex(); - if (empty($vertices[$vertex1->getLabel()])){ - $vertices[$vertex1->getLabel()] = $vertice_no; - $vertice_no++; - } - if (empty($vertices[$vertex2->getLabel()])){ - $vertices[$vertex2->getLabel()] = $vertice_no; - $vertice_no++; - } - } - $k = 0; - $tree_heap->top(); - while($k < count($vertices) - 1) { - - $min_edge = $tree_heap->extract(); - $vertex1= $min_edge->getStartVertex()->getLabel(); - $vertex2 = $min_edge->getEndVertex()->getLabel(); - if ($vertices[$vertex1] != $vertices[$vertex2]){ - if ($vertices[$vertex1] < $vertices[$vertex2]){ - $m = $vertices[$vertex2]; - $n = $vertices[$vertex1]; - } else { - $m = $vertices[$vertex1]; - $n = $vertices[$vertex2]; - } - foreach ($vertices as $vertex => $no){ - if ($no == $m){ - $vertices[$vertex] = $n; - } - } - $min_edges[] = $min_edge; - $k++; - } - } - return $min_edges; -} -/** - * Clusters the recipes by applying Kruskal's algorithm - * - * @param array $edges array of triples (recipe_1_title, recipe_2_title, weight) - * @param int $count number of recipes. - * @param array $distinct_ingredients list of possible ingredients - * @return clusters of recipes. 
- */
-function kruskalClustering($edges, $count, $distinct_ingredients)
-{
-    $mst_edges = construct_tree($edges);
-    $mst = new Tree();
-    $mst->constructMST($mst_edges);
-    $clusters = $mst->formCluster(count($mst_edges), $count);
-    $new_clusters = $mst->findCommonIngredient($clusters,
-        $distinct_ingredients);
-    return $new_clusters;
-}
diff --git a/src/library/processors/SvgProcessor.php b/src/library/processors/SvgProcessor.php
index 7cfcd3d12..87cb6b0c4 100644
--- a/src/library/processors/SvgProcessor.php
+++ b/src/library/processors/SvgProcessor.php
@@ -99,7 +99,7 @@ class SvgProcessor extends TextProcessor
                 $summary[self::THUMB] = 'data:image/svg+xml;base64,'.
                     base64_encode($thumb_string);
             }
-        }else {
+        } else {
             $summary = parent::process($page, $url);
         }
     }
@@ -144,7 +144,9 @@ class SvgProcessor extends TextProcessor
     public static function dom($page)
     {
         $dom = new \DOMDocument();
+        restore_error_handler();
         @$dom->loadXML($page);
+        set_error_handler(C\NS_LIB . "yioop_error_handler");
         return $dom;
     }
     /**
diff --git a/src/models/SigninModel.php b/src/models/SigninModel.php
index 38122cee3..fc817385d 100755
--- a/src/models/SigninModel.php
+++ b/src/models/SigninModel.php
@@ -56,8 +56,11 @@ class SigninModel extends Model
             return false;
         }
         $row = $db->fetchArray($result);
+        if (empty($row)) {
+            return false;
+        }
         // avoid timeing attacks if possible
-        if (function_exists('hash_equals')) {
+        if (function_exists('hash_equals')) {
             return hash_equals(L\crawlCrypt($password, $row['PASSWORD']),
                 $row['PASSWORD']);
         }