/*
 * Packaging notes carried by this patch segment (not part of main.js).
 *
 * TOOLBAR INSTALL.txt -- steps to install the add-on
 *   1. Download the project zip file. Save it on your desktop.
 *   2. Unzip the file.
 *   3. Create a plain-text file named after the add-on id declared in
 *      install.rdf, i.e. "yioop@seekquarry.com" (the original note said
 *      "sampletest@example.com", which does not match install.rdf).
 *      Give the path of your extension folder in that file.
 *      Eg: C:\Documents and Settings\<username>\Desktop\extensions\my_extension\
 *   4. Save that file at the following location
 *      C:\Documents and Settings\<username>\Application Data\Mozilla\Firefox\Profiles\<somename>.default\extensions\
 *   5. Start the Firefox browser.
 *
 * chrome.manifest
 *   content yiooptoolbar chrome/content/
 *   overlay chrome://browser/content/browser.xul chrome://yiooptoolbar/content/main.xul
 *   skin yioop_toolbar classic/1.0 chrome/skin/
 *   NOTE(review): the skin line uses package name "yioop_toolbar" while the
 *   content line registers "yiooptoolbar" -- confirm which one is intended.
 *
 * main.css
 *   #toolbar-button {
 *       list-style-image: url("chrome://yiooptoolbar/content/yioop_16x16.png");
 *   }
 */

/**
 * Creates the XMLHttpRequest object used to post toolbar data.
 *
 * @return {XMLHttpRequest|boolean} a new request, or false when the
 *      constructor is unavailable or throws
 */
function createXHR()
{
    try {
        return new XMLHttpRequest();
    } catch (err1) {
        return false;
    }
}

/**
 * Deletes the rows in the search_capture table after the toolbar data has
 * been sent to Yioop!
 */
function deleteRows()
{
    var file = Components.classes["@mozilla.org/file/directory_service;1"]
        .getService(Components.interfaces.nsIProperties)
        .get("ProfD", Components.interfaces.nsIFile);
    file.append("user_searchcapture.sqlite");

    var storageService = Components.classes["@mozilla.org/storage/service;1"]
        .getService(Components.interfaces.mozIStorageService);
    // Will also create the file if it does not exist
    var mDBConn = storageService.openDatabase(file);

    var statement = null;
    try {
        statement = mDBConn.createStatement("DELETE FROM search_capture");
        statement.executeAsync();
    } finally {
        // release the statement and the connection opened for this call;
        // asyncClose lets the pending asynchronous DELETE finish first
        if (statement) {
            statement.finalize();
        }
        mDBConn.asyncClose();
    }
}

/**
 * Makes a POST request that sends the captured toolbar data to Yioop!
 * When the server answers with status 200 the uploaded rows are removed
 * with deleteRows(); on any other outcome they are kept.
 *
 * @param {string} url address of the Yioop! script receiving the data
 * @param {Array|string} record captured rows; converted with String(), so
 *      an array is joined with commas exactly as string concatenation did
 */
function uploadAsyc(url, record)
{
    // the rows contain URLs, so "&" and "=" must be escaped or the server
    // would split the b= value into bogus form fields
    var params = "c=toolbar&a=toolbarTraffic&b=" +
        encodeURIComponent(String(record));
    var xhr = createXHR();
    if (!xhr) {
        // no request object available: leave the rows for a later attempt
        return;
    }
    xhr.onreadystatechange = function() {
        // calls deleteRows on status OK once the request has completed
        if (xhr.readyState === 4 && xhr.status === 200) {
            deleteRows();
        }
    };
    xhr.open("POST", url, true);
    xhr.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");
    xhr.send(params);
}
/**
 * Click handler for hyperlinks (installed by linkclick).
 * Opens -- creating it if needed -- the sqlite database
 * user_searchcapture.sqlite in the profile folder, makes sure the
 * search_capture table exists and inserts one row describing the click:
 * the link's markup, the address of the current page, the link's target,
 * the time of the click and the page language (lang or xml:lang attribute
 * of the html element). Once the insert has completed, sendCaptureTest()
 * is called to check whether enough rows were collected for an upload.
 *
 * @param {Event} event the click event captured on a hyperlink
 */
function getword(event)
{
    var html = content.document.getElementsByTagName("html")[0];
    var language1 = html ? html.getAttribute("lang") : null;
    if (language1 == null && html) {
        language1 = html.getAttribute("xml:lang");
    }
    // The listener is registered on the <a> element itself, so currentTarget
    // is always the link; target may be a nested node that has no href.
    var link = event.currentTarget || event.target;

    var file = Components.classes["@mozilla.org/file/directory_service;1"]
        .getService(Components.interfaces.nsIProperties)
        .get("ProfD", Components.interfaces.nsIFile);
    file.append("user_searchcapture.sqlite");

    var storageService = Components.classes["@mozilla.org/storage/service;1"]
        .getService(Components.interfaces.mozIStorageService);
    // Will also create the file if it does not exist
    var mDBConn = storageService.openDatabase(file);

    var stmt = null;
    try {
        mDBConn.executeSimpleSQL("CREATE TABLE IF NOT EXISTS search_capture " +
            "(word TEXT, searchurl TEXT, searchurl1 TEXT, " +
            "timestamp TEXT, language TEXT)");

        stmt = mDBConn.createStatement("INSERT INTO search_capture " +
            "(word,searchurl,searchurl1,timestamp,language) " +
            "VALUES(:word1,:url1,:url2,:time1,:lang1)");

        stmt.params.word1 = link.innerHTML;
        stmt.params.url1 = window.content.location.href;
        stmt.params.url2 = link.href;
        stmt.params.time1 = new Date();
        stmt.params.lang1 = language1;
        stmt.executeAsync({
            handleResult: function(aResultSet) {
                // an INSERT produces no rows
            },
            handleError: function(aError) {
                alert("Error: " + aError.message);
            },
            handleCompletion: function(aReason) {
                // count the rows only after the insert has finished, so the
                // new row is visible to the SELECT in sendCaptureTest
                sendCaptureTest();
            }
        });
    } finally {
        // release the statement and the connection opened for this click;
        // asyncClose lets the pending asynchronous INSERT finish first
        if (stmt) {
            stmt.finalize();
        }
        mDBConn.asyncClose();
    }
}
/**
 * Retrieves all the rows from the search_capture table and, when the query
 * has finished and at least 10 rows were read, calls uploadAsyc to send the
 * toolbar data to Yioop!
 *
 * Each row is serialised as
 * word|:|searchurl|:|searchurl1|:|timestamp|:|language followed by "\n".
 */
function sendCaptureTest()
{
    var yioopurl = "http://www.yioop.com/";
    var file = Components.classes["@mozilla.org/file/directory_service;1"]
        .getService(Components.interfaces.nsIProperties)
        .get("ProfD", Components.interfaces.nsIFile);
    file.append("user_searchcapture.sqlite");

    var storageService = Components.classes["@mozilla.org/storage/service;1"]
        .getService(Components.interfaces.mozIStorageService);
    // Will also create the file if it does not exist
    var mDBConn = storageService.openDatabase(file);

    var colnew = [];
    var statement = null;
    try {
        statement = mDBConn.createStatement("SELECT * FROM search_capture");
        statement.executeAsync({
            handleResult: function(aResultSet) {
                // handleResult may be invoked several times with partial
                // result sets, so rows are appended rather than indexed from 0
                for (var row = aResultSet.getNextRow(); row;
                    row = aResultSet.getNextRow()) {
                    colnew.push(row.getResultByName("word") + "|:|" +
                        row.getResultByName("searchurl") + "|:|" +
                        row.getResultByName("searchurl1") + "|:|" +
                        row.getResultByName("timestamp") + "|:|" +
                        row.getResultByName("language") + "\n");
                }
            },
            handleError: function(aError) {
                alert("Error: " + aError.message);
            },
            handleCompletion: function(aReason) {
                if (aReason != Components.interfaces
                    .mozIStorageStatementCallback.REASON_FINISHED) {
                    alert("Query canceled or aborted!");
                    return;
                }
                // decide once, after every row has been delivered
                if (colnew.length >= 10) {
                    uploadAsyc(yioopurl, colnew);
                }
            }
        });
    } finally {
        // release the statement and the connection opened for this call;
        // asyncClose lets the pending asynchronous SELECT finish first
        if (statement) {
            statement.finalize();
        }
        mDBConn.asyncClose();
    }
}
/**
 * Entry point of the add-on: main.xul calls this from a "load" listener
 * registered on the browser document. It collects every hyperlink of the
 * page currently shown and registers getword as a capturing click listener
 * on each of them, so that getword runs when the user clicks a link.
 */
function linkclick()
{
    var anchors = content.document.getElementsByTagName("a");
    var index = 0;
    while (index < anchors.length) {
        // capture phase, same as the overlay's own load listener
        anchors[index].addEventListener("click", getword, true);
        index += 1;
    }
}

/*
 * main.xul -- overlay carried by the same patch segment (not part of
 * main.js). It loads main.js, hooks linkclick to "load" and adds the
 * toolbar:
 *
 * <?xml version="1.0"?>
 *
 * <?xml-stylesheet
 *     href="chrome://yiooptoolbar/content/main.css" type="text/css" ?>
 * <overlay id="sample"
 *     xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul" >
 *
 * <script type="application/x-javascript"
 *     src="chrome://yiooptoolbar/content/main.js" />
 *
 * <script>
 *     document.addEventListener("load", function() { linkclick(); }, true);
 * </script>
 *
 * <window id="main-window">
 * <toolbox id="navigator-toolbox">
 * <toolbar id="tool-toolbar" toolbarname="Smartsearch Toolbar" accesskey="T"
 *     class="chromeclass-toolbar" context="toolbar-context-menu"
 *     hidden="false" persist="hidden">
 * <toolbarbutton id="toolbar-button" label="SmartSearch" value="3"/>
 * </toolbar>
 * </toolbox>
 * </window>
 * </overlay>
 *
 * The patch also adds the icons chrome/content/yioop_16x16.png and
 * chrome/content/yioop_32x32.png (binary) and install.rdf.
 */
100644 index 000000000..f5d06c7ff --- /dev/null +++ b/browser_extensions/firefox/yiooptoolbar/install.rdf @@ -0,0 +1,30 @@ +<?xml version="1.0"?> + +<RDF xmlns="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:em="http://www.mozilla.org/2004/em-rdf#"> + +<Description about="urn:mozilla:install-manifest"> + <em:id>yioop@seekquarry.com</em:id> + <em:version>1.0</em:version> + <em:type>2</em:type> + +<!-- Target Application this extension can install into, + with minimum and maximum supported versions. --> +<em:targetApplication> + <Description> + <em:id>{ec8030f7-c20a-464f-9b0e-13a3a9e97384}</em:id> + <em:minVersion>1.5</em:minVersion> + <em:maxVersion>4.0.*</em:maxVersion> + </Description> +</em:targetApplication> + +<!-- Front End MetaData --> + <em:name>Yioop! Toolbar</em:name> + <em:creator>Seekquarry.com</em:creator> + <em:description>Used to send web traffic data to the Yioop! +Search Engine</em:description> + <em:homepageURL>http://www.seekquarry.com/</em:homepageURL> + <em:iconURL>chrome://yiooptoolbar/content/yioop_32x32.png</em:iconURL> + +</Description> +</RDF> diff --git a/lib/indexing_plugins/indexing_plugin.php b/lib/indexing_plugins/indexing_plugin.php new file mode 100644 index 000000000..00348a3db --- /dev/null +++ b/lib/indexing_plugins/indexing_plugin.php @@ -0,0 +1,79 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2011 Priya Gangaraju priya.gangaraju@gmail.com + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
/**
 * See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Priya Gangaraju priya.gangaraju@gmail.com
 * @package seek_quarry
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2011
 * @filesource
 */

if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

/** Some models might interface with a DBMS so load the DBMS manager*/
require_once BASE_DIR."/models/datasources/".DBMS."_manager.php";

/**
 * Indexing Component Class: base class of the indexing plugins.
 *
 * The constructor creates the DBMS manager named by the DBMS constant and
 * one model object for every entry of $models. Subclasses implement
 * postProcessing and may override the two static hooks.
 *
 * @author Priya Gangaraju
 * @package seek_quarry
 * @subpackage component
 */
abstract class IndexingPlugin
{
    /**
     * Not assigned anywhere in this class
     * NOTE(review): presumably the page processors of the plugin -- confirm
     * @var array
     */
    var $processors = array();
    /**
     * Names of the models to load; for a name "x" the constructor requires
     * models/x_model.php and stores a new XModel in $this->xModel
     * @var array
     */
    var $models = array();
    /**
     * Not assigned anywhere in this class
     * @var object
     */
    var $index_archive;
    /**
     * Instance of the <Dbms>Manager class selected by the DBMS constant
     * @var object
     */
    var $db;

    /**
     * Instantiates the datasource manager and the models listed in $models
     */
    function __construct()
    {
        $manager_class = ucfirst(DBMS)."Manager";
        $this->db = new $manager_class();

        require_once BASE_DIR."/models/model.php";

        foreach($this->models as $model) {
            require_once BASE_DIR."/models/".$model."_model.php";

            $class_name = ucfirst($model)."Model";
            $property = lcfirst($class_name);

            $this->{$property} = new $class_name();
        }
    }

    /**
     * Page processors this plugin wants to post-process pages for
     *
     * @return mixed NULL in this base class
     */
    static function getProcessors()
    {
        return NULL;
    }

    /**
     * Meta words added by this plugin
     *
     * @return array empty in this base class
     */
    static function getAdditionalMetaWords()
    {
        return array();
    }

    /**
     * Work to be carried out by the plugin on a given index
     *
     * @param string $index_name name of the index to process
     */
    abstract function postProcessing($index_name);
}
Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Priya Gangaraju priya.gangaraju@gmail.com + * @package seek_quarry + * @subpackage component + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2011 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + * Flag to say that post_processing is occurring (used to control logging in + * models) + */ +define("POST_PROCESSING", true); + +/** + * Ratio of clusters/total number of recipes seen + */ +define("CLUSTER_RATIO", 0.1); + +/** Base indexing plugin class*/ +require_once BASE_DIR."/lib/indexing_plugins/indexing_plugin.php"; +/** Used to create index shards to add ingredient: entries + * to index + */ +require_once BASE_DIR."/lib/index_shard.php"; +/** Used to extract text from documents*/ +require_once BASE_DIR."/lib/phrase_parser.php"; +/** Get the crawlHash function */ +require_once BASE_DIR."/lib/utility.php"; +/** Loads common constants for web crawling */ +require_once BASE_DIR."/lib/crawl_constants.php"; + +/** + * This class handles recipe processing. + * It extracts ingredients from the recipe pages while crawling. + * It clusters the recipes using Kruskal's minimum spanning tree + * algorithm after crawl is stopped. 
/**
 * This class handles recipe processing.
 * It extracts ingredients from the recipe pages while crawling.
 * It clusters the recipes using Kruskal's minimum spanning tree
 * algorithm after crawl is stopped. This plugin was designed by
 * looking at what was needed to screen scrape recipes from the
 * following sites:
 *
 * http://allrecipes.com/
 * http://www.food.com/
 * http://www.betterrecipes.com/
 * http://www.foodnetwork.com/
 * http://www.bettycrocker.com/
 *
 *
 * @author Priya Gangaraju
 * @package seek_quarry
 * @subpackage component
 */
class RecipePlugin extends IndexingPlugin implements CrawlConstants
{

    /**
     * The models used by this indexing plugin
     * @var array
     */
    var $models = array("phrase", "locale", "crawl");

    /**
     * Which mime type page processors this plugin should do additional
     * processing for
     *
     * @return array an array of page processors
     */
    static function getProcessors()
    {
        return array("HtmlProcessor");
    }

    /**
     * Returns an array of additional meta words which have been added by
     * this plugin
     *
     * @return array meta words and maximum description length of results
     *      allowed for that meta word (in this case 2000 as want
     *      to allow sufficient descriptions of whole recipes)
     */
    static function getAdditionalMetaWords()
    {
        return array("recipe:" => 2000, "ingredient:" => 2000);
    }

    /**
     * Extracts title and description from a recipe page. This is
     * called by the PageProcessor (or subclass) handle($page, $url) method
     *
     * Every node matching the recipe container expressions yields one
     * sub-document. Its title is the title node at the same position (or
     * "" when the page has fewer title nodes) and its description is the
     * text of all ingredient nodes of the page joined with "||".
     *
     * @param string $page web page contents
     * @param string $url the url the page was downloaded from (not used)
     * @return array sub-documents, each with self::TITLE and
     *      self::DESCRIPTION; NULL if the page could not be parsed
     */
    function pageProcessing($page, $url)
    {
        $page = preg_replace('@<script[^>]*?>.*?</script>@si', ' ', $page);
        $page = preg_replace('/>/', '> ', $page);
        $dom = HtmlProcessor::dom($page);
        if($dom == NULL) return NULL;

        $xpath = new DOMXPath($dom);
        $recipes_per_page = $xpath->evaluate(
            "/html//div[@class = 'ingredients'] | ".
            "/html//div[@class = 'body-text'] | ".
            "/html//ul[@class = 'clr'] | ".
            "/html//div[@class = 'recipeDetails']".
            "/ul[@class='ingredient_list']");
        $subdocs_description = array();
        $recipes_count = $recipes_per_page->length;
        if($recipes_count == 0) {
            return $subdocs_description;
        }
        $titles = $xpath->evaluate(
            "/html//div[@class='rectitle'] | ".
            "/html//h1[@class = 'fn'] | ".
            "/html//div[@class = 'pod about-recipe clrfix']/p | ".
            "/html//h1[@class = 'recipeTitle']");
        // the ingredient query does not depend on the recipe, so run it once
        $ingredients = $xpath->evaluate(
            "/html//div[@class = 'ingredients']/ul/li | ".
            "/html//div[@class = 'body-text']/ul/li[@class = 'ingredient'] | ".
            "/html//ul[@class = 'clr']/li | ".
            "/html//div[@class = 'recipeDetails']".
            "/ul[@class='ingredient_list']/li | ".
            "/html//div[@class = 'ingredients']".
            "/table/tr[@class = 'ingredient']");
        $contents = array();
        foreach($ingredients as $ingredient) {
            $content = trim($ingredient->textContent);
            if(!empty($content)) {
                $contents[] = $content;
            }
        }
        $ingredients_result = "";
        if(count($contents) > 0) {
            // "||" only between ingredients, no trailing separator
            $ingredients_result = mb_ereg_replace(
                "(\s)+", " ", implode("||", $contents));
        }
        for($i = 0; $i < $recipes_count; $i++) {
            $title_node = $titles->item($i);
            $recipe = array();
            $recipe[self::TITLE] =
                ($title_node === NULL) ? "" : $title_node->textContent;
            $recipe[self::DESCRIPTION] = $ingredients_result;
            $subdocs_description[] = $recipe;
        }

        return $subdocs_description;
    }

    /**
     * Implements post processing of recipes. recipes are extracted
     * ingredients are scrubbed and recipes are clustered. The clustered
     * recipes are added back to the index.
     *
     * NOTE(review): recipes are keyed by title below, so two recipes with
     * the same title share one entry -- confirm whether that is acceptable.
     *
     * @param string $index_name index name of the current crawl.
     */
    function postProcessing($index_name)
    {
        $this->phraseModel->index_name = $index_name;
        $this->crawlModel->index_name = $index_name;

        $index_archive_name = self::index_data_base_name . $index_name;
        $index_archive = new IndexArchiveBundle(
            CRAWL_DIR.'/cache/'.$index_archive_name);
        $query_iterator = new WordIterator(crawlHash("recipe:all"),
            $index_archive);
        $raw_recipes = array();
        while(is_array($next_docs = $query_iterator->nextDocsWithWord())) {
            foreach($next_docs as $doc_key => $doc_info) {
                $summary = & $doc_info[CrawlConstants::SUMMARY];
                $summary['KEY'] = $doc_key;
                $tmp = unserialize($query_iterator->getIndex(
                    $doc_key)->description);
                $doc_info[self::CRAWL_TIME] = $tmp[self::CRAWL_TIME];
                unset($doc_info[CrawlConstants::SUMMARY]);
                if(is_array($summary)) {
                    $raw_recipes[] = array_merge($doc_info, $summary);
                }
            }
        }
        // only cluster if would make more than one cluster
        if(count($raw_recipes) * CLUSTER_RATIO <= 1) {
            return;
        }

        // each entry: 0 => title, 1 => scrubbed ingredients, 2 => doc key,
        // 3 => raw summary
        $recipes = array();
        foreach($raw_recipes as $raw_recipe) {
            $description = $raw_recipe[self::DESCRIPTION];
            if(trim($description) == "") {
                continue;
            }
            $ingredients = array();
            foreach(explode("||", $description) as $index => $ingredient) {
                // drop blanks and entries ending in ":" (presumably section
                // headings rather than ingredients)
                if(strlen($ingredient) == 0 ||
                    substr($ingredient, -1) == ":") {
                    continue;
                }
                $main_ingredient =
                    $this->getIngredientName((string)$ingredient);
                if(strlen($main_ingredient) != 0) {
                    $ingredients[$index] = $main_ingredient;
                }
            }
            $recipes[] = array($raw_recipe[self::TITLE], $ingredients,
                $raw_recipe['KEY'], $raw_recipe);
        }
        $count = count($recipes);
        if($count < 2) {
            // no pair of recipes, hence no edge to build a tree from
            return;
        }
        $basic_ingredients = array(
            'onion','oil','cheese','pepper','sauce',
            'salt','milk','butter','flour','cake',
            'garlic','cream','soda','honey','powder',
            'sauce','water','vanilla','pepper','bread',
            'sugar','vanillaextract','celery',
            'seasoning','syrup','skewers','egg',
            'muffin','ginger','basil','oregano',
            'cinammon','cumin','mayonnaise','mayo',
            'chillipowder','lemon','greens','yogurt',
            'margarine','asparagus','halfhalf',
            'pancakemix','coffee','cookies','lime',
            'chillies','cilantro','rosemary',
            'vanillaextract','vinegar','shallots',
            'wine','cornmeal','nonstickspray');

        $distinct_ingredients = array();
        $doc_keys = array();
        $recipes_summary = array();
        $ingredient_sets = array();
        $main_ingredients = array();
        for($i = 0; $i < $count; $i++) {
            $recipe_name = $recipes[$i][0];
            $distinct_ingredients[$recipe_name] = $recipes[$i][1];
            $doc_keys[$recipe_name] = $recipes[$i][2];
            $recipes_summary[$recipe_name] = $recipes[$i][3];
            $ingredient_sets[$i] = array_unique($recipes[$i][1]);
            $main_ingredients[$i] = $this->findMainIngredient(
                strtolower($recipe_name), $recipes[$i][1],
                $basic_ingredients);
        }

        $weights = array();
        for($i = 0; $i < $count; $i++) {
            for($j = $i + 1; $j < $count; $j++) {
                // number of ingredients that only one of the two recipes
                // has; this equals the squared distance of their 0/1
                // ingredient vectors
                $differences =
                    count(array_diff($ingredient_sets[$i],
                        $ingredient_sets[$j])) +
                    count(array_diff($ingredient_sets[$j],
                        $ingredient_sets[$i]));
                $main_ingredient_match = 1;
                if($main_ingredients[$i] != $main_ingredients[$j]) {
                    // push recipes with different main ingredients apart
                    $main_ingredient_match = 1000;
                }
                $edge_weight = sqrt($differences) *
                    ($differences + 1) * $main_ingredient_match;
                $weights[] = array($recipes[$i][0], $recipes[$j][0],
                    $edge_weight);
            }
        }

        $clusters = kruskalClustering($weights,
            $count, $distinct_ingredients);
        $index_shard = new IndexShard("cluster_shard");
        $word_counts = array();
        $recipe_sites = array();
        $summary_fields = array(self::URL, self::TITLE, self::DESCRIPTION,
            self::TIMESTAMP, self::ENCODING, self::HASH, self::TYPE,
            self::HTTP_CODE);

        foreach($clusters as $cluster) {
            // the last element of a cluster is its "ingredient" entry
            $recipe_count = count($cluster) - 1;
            for($i = 0; $i < $recipe_count; $i++) {
                $recipe = $cluster[$i];
                $doc_key = $doc_keys[$recipe];
                $summary = array();
                foreach($summary_fields as $field) {
                    $summary[$field] = $recipes_summary[$recipe][$field];
                }
                $recipe_sites[] = $summary;
                $meta_ids = array("ingredient:".$cluster["ingredient"]);
                $index_shard->addDocumentWords($doc_key,
                    self::NEEDS_OFFSET_FLAG,
                    $word_counts, $meta_ids, true, false);
                $index_shard->save(true);
            }
        }

        $dir = CRAWL_DIR."/cache/".self::index_data_base_name.$index_name;
        $index_archive = new IndexArchiveBundle($dir, false);
        $generation = $index_archive->initGenerationToAdd($index_shard);
        $index_archive->addPages($generation,
            self::SUMMARY_OFFSET, $recipe_sites, 0);
        $summary_offsets = array();
        foreach($recipe_sites as $site) {
            $hash = crawlHash($site[self::URL], true).
                $site[self::HASH] .
                crawlHash("link:".$site[self::URL], true);
            $summary_offsets[$hash] =
                array($site[self::SUMMARY_OFFSET], null);
        }
        $index_shard->changeDocumentOffsets($summary_offsets);
        $index_archive->addIndexData($index_shard);
        $index_archive->saveAndAddCurrentShardDictionary();
        $index_archive->dictionary->mergeAllTiers();
        $this->db->setWorldPermissionsRecursive(
            CRAWL_DIR.'/cache/'.
            self::index_data_base_name.$index_name);
    }

    /**
     * Finds the ingredient a recipe is named after.
     *
     * @param string $title lower-cased recipe title
     * @param array $ingredients scrubbed ingredient names of the recipe
     * @param array $basic_ingredients ingredients never treated as main
     * @return string the last non-basic ingredient occurring in the title,
     *      or "" if there is none
     */
    function findMainIngredient($title, $ingredients, $basic_ingredients)
    {
        $main_ingredient = "";
        foreach($ingredients as $ingredient) {
            if($ingredient != "" &&
                !in_array($ingredient, $basic_ingredients) &&
                strstr($title, $ingredient)) {
                $main_ingredient = $ingredient;
            }
        }
        return $main_ingredient;
    }

    /**
     * Extracts the main ingredient from the ingredient.
     *
     * Digits and slashes are blanked out and the text is lower-cased. If
     * the text contains one of the known varieties it is reduced to that
     * variety. Measurement, size, preposition and other filler words as
     * well as words ending in "ly", "ed" or "ing" are dropped, and what
     * remains is concatenated with everything but letters removed.
     *
     * @param string $text ingredient.
     * @return string $name main ingredient
     */
    function getIngredientName($text)
    {
        $special_chars = array('/\d+/','/\\//');
        $ingredient = preg_replace($special_chars," ", $text);
        $ingredient = strtolower($ingredient);
        $varieties = array('apple','bread','cheese','chicken','shrimp',
            'tilapia','salmon','butter','chocolate','sugar','pepper','water',
            'mustard','cream','lettuce','sauce','crab','garlic','mushrooms',
            'tortilla','potatoes','steak','rice','vinegar','carrots',
            'marshmellows','onion','oil','ham','parsley','cilantro','broth',
            'stock','flour','seasoning','banana','pasta','noodles','pork',
            'bacon','olives','spinach','yogurt','celery','beans','egg',
            'apricot','whiskey','wine','milk','mango','tomato','lemon',
            'salsa','herbs','sourdough','prosciutto','seasoning','syrup',
            'honey','skewers','muffin','beef','cinammon','thyme','asparagus',
            'turkey','pumpkin');
        foreach($varieties as $variety){
            if(strstr($ingredient, $variety)) {
                $ingredient = $variety;
            }
        }
        $words = explode(' ', $ingredient);
        $measurements = array('cup','cups','ounces','teaspoon','teaspoons',
            'tablespoon','tablespoons','pound','pounds','tbsp','tsp','lbs',
            'inch','pinch','oz','lb','tbs','can','bag','C','c','tb');

        $sizes = array('small','large','thin','less','thick','bunch');

        $prepositions = array('into', 'for', 'by','to','of');

        $misc = array('hot','cold','room','temperature','plus','stick','pieces',
            "confectioners",'semisweet','white','all-purpose','bittersweet',
            'cut','whole','or','and','french','wedges','package','pkg','shells',
            'cartilege','clean','hickory','fillets','fillet','plank','planks',
            'cedar','taste','spicy','glaze','crunchy','sharp','chips','juice',
            'optional','fine','regular','dash','overnight','soaked','classic',
            'firm','delicious','prefer','plain');

        $attributes = array('boneless','skinless','breast','legs','thighs',
            'washington','fresh','flat','leaf','ground','extra','virgin','dry',
            'cloves','lean','ground','roma','all purpose','light','brown',
            'idaho','kosher','frozen','garnish');

        // words that never name an ingredient
        $stop_words = array_merge($measurements, $sizes, $prepositions,
            $misc, $attributes);

        $nouns = array();
        $endings = array('/\,/','/\./','/\+/','/\*/',"/'/","/\(/","/\)/");
        foreach($words as $word) {
            if($word == '') {
                continue;
            }
            $word = strtolower($word);
            foreach($varieties as $variety){
                if(strstr($word,$variety))
                    $word = $variety;
            }
            $word = preg_replace($endings,"",$word);
            if(in_array($word, $stop_words)) {
                continue;
            }
            $ending = substr($word, -2);
            $ending2 = substr($word, -3);
            if($ending != 'ly' && $ending != 'ed' && $ending2 != 'ing') {
                $nouns[] = $word;
            }
        }
        $name = implode(" ", $nouns);
        $name = preg_replace('/[^a-zA-Z]/', "", $name);
        return $name;
    }

}
/**
 * Gets the language tag (for instance, en_US for American English) of the
 * locale that is currently being used.
 *
 * @return string "en_US" since for now the recipe plugin only works
 *      with English recipes
 */
if(!function_exists("getLocaleTag")) {
    function getLocaleTag()
    {
        return "en_US";
    }
}

/**
 * class to define vertex
 */
class Vertex
{
    private $label;
    private $visited;

    /**
     * @param mixed $label name identifying the vertex
     */
    function __construct($label){
        $this->label = $label;
        $this->visited = false;
    }

    /** @return mixed the label given at construction */
    function getLabel(){
        return $this->label;
    }

    /** Marks the vertex as visited */
    function visited(){
        $this->visited = true;
    }

    /** @return bool whether visited() has been called */
    function isVisited(){
        return $this->visited;
    }
}
/**
 * class to define edge
 */
class Edge
{
    private $start_vertex;
    private $end_vertex;
    private $cost;

    /**
     * @param mixed $vertex1 label of the start vertex
     * @param mixed $vertex2 label of the end vertex
     * @param mixed $cost weight of the edge
     */
    function __construct($vertex1,$vertex2,$cost){
        $this->start_vertex = new Vertex($vertex1);
        $this->end_vertex = new Vertex($vertex2);
        $this->cost = $cost;
    }

    /** @return Vertex vertex created for the first label */
    function getStartVertex(){
        return $this->start_vertex;
    }

    /** @return Vertex vertex created for the second label */
    function getEndVertex(){
        return $this->end_vertex;
    }

    /** @return mixed the cost given at construction */
    function getCost(){
        return $this->cost;
    }
}

/**
 * class to define Minimum Spanning tree. constructMST constructs
 * the minimum spanning tree using heap. formCluster forms clusters by
 * deleting the most expensive edge. BreadthFirstSearch is used to
 * traverse the MST.
 */
class Tree
{
    private $cluster_heap;
    private $vertices;
    private $adjMatrix;

    function __construct(){
        $this->cluster_heap = new Cluster();
        $this->vertices = array();
        $this->adjMatrix = array();
    }

    /**
     * constructs the adjacency matrix for the MST.
     *
     * Every edge is also pushed on the heap used by formCluster to find
     * the most expensive edges, and the first Vertex object seen for a
     * label is registered as the vertex of that label.
     *
     * @param object array $edges vertices and edge weights of MST
     */
    function constructMST($edges)
    {
        foreach($edges as $edge) {
            $this->cluster_heap->insert($edge);
            $vertex1 = $edge->getStartVertex();
            $vertex2 = $edge->getEndVertex();
            $label1 = $vertex1->getLabel();
            $label2 = $vertex2->getLabel();
            $this->adjMatrix[$label1][$label2] = $label2;
            $this->adjMatrix[$label2][$label1] = $label1;
            // keyed lookup instead of a loose object comparison against
            // every registered vertex
            if(!isset($this->vertices[$label1]))
                $this->vertices[$label1] = $vertex1;
            if(!isset($this->vertices[$label2]))
                $this->vertices[$label2] = $vertex2;
        }

    }

    /**
     * forms the clusters by removing maximum weighted edges.
     * performs breadth-first search to cluster the recipes.
     *
     * The number of edges removed is the number of integers j >= 0 with
     * j < $size * CLUSTER_RATIO - 1, limited by the edges available.
     *
     * @param int $k queue size
     * @param int $size number of recipes.
     * @return array $cluster clusters of recipes.
     */
    function formCluster($k, $size)
    {
        $nodeQueue = new Queue($k);
        $cluster_count = $size * CLUSTER_RATIO;
        $cluster = array();
        for($j = 0; $j < $cluster_count - 1 &&
            !$this->cluster_heap->isEmpty(); $j++) {
            $max_edge = $this->cluster_heap->extract();
            $cluster1_start = $max_edge->getStartVertex()->getLabel();
            $cluster2_start = $max_edge->getEndVertex()->getLabel();
            // -1 marks the edge as cut
            $this->adjMatrix[$cluster1_start][$cluster2_start] = -1;
            $this->adjMatrix[$cluster2_start][$cluster1_start] = -1;
            $nodeQueue->enqueue($cluster1_start);
            $nodeQueue->enqueue($cluster2_start);
        }
        $queue = new Queue($k);
        $i = 0;
        while(!$nodeQueue->isEmpty()) {
            $node = $nodeQueue->dequeue();
            if($this->vertices[$node]->isVisited() == false){
                $this->vertices[$node]->visited();
                $cluster[$i][] = $this->vertices[$node]->getLabel();
                $queue->enqueue($this->vertices[$node]->getLabel());
                while(!$queue->isEmpty()){
                    $node = $queue->dequeue();
                    while(($nextnode = $this->getNextVertex($node))
                        !== -1){
                        $this->vertices[$nextnode]->visited();
                        $cluster[$i][] =
                            $this->vertices[$nextnode]->getLabel();
                        $queue->enqueue(
                            $this->vertices[$nextnode]->getLabel());
                    }
                }
            }
            $i++;
        }
        return $cluster;
    }

    /**
     * gets the next vertex from the adjacency matrix for a given vertex
     *
     * @param string $vertex vertex
     * @return adjacent vertex if it has otherwise -1.
     */
    function getNextVertex($vertex)
    {
        foreach($this->adjMatrix[$vertex] as $value) {
            if($value !== -1
                && ($this->vertices[$value]->isVisited() == false)) {
                return $value;
            }

        }
        return -1;
    }

    /**
     * Finds the common ingredient for each of the clusters.
     *
     * The common ingredient is the first of the most frequent non-basic
     * ingredients of the cluster's recipes, or "" when the recipes have
     * no non-basic ingredient at all.
     *
     * @param array $clusters clusters of recipes.
     * @param array $ingredients array of ingredients of recipes.
     * @return array $new_clusters clusters with common ingredient appended.
     */
    function findCommonIngredient($clusters,$ingredients)
    {
        $new_clusters = array();
        $basic_ingredients = array("onion","oil","cheese","pepper","sauce",
            "salt","milk","butter",'flour','cake','garlic','cream','soda',
            'honey','powder','sauce','water','vanilla','pepper','bread',
            'sugar','vanillaextract','celery','seasoning','syrup','skewers',
            'egg','muffin','ginger','basil','oregano','cinammon','cumin',
            'mayonnaise','mayo','chillipowder','lemon','greens','yogurt',
            'margarine','asparagus','halfhalf','pancakemix','coffee',
            'cookies','lime','chillies','cilantro','rosemary','vanillaextract',
            'vinegar','shallots','wine','cornmeal','nonstickspray');
        foreach($clusters as $cluster) {
            $cluster_recipe_ingredients = array();
            $recipes_in_cluster = count($cluster);
            for($i = 0; $i < $recipes_in_cluster; $i++){
                $recipe_name = $cluster[$i];
                $main_ingredients =
                    array_diff($ingredients[$recipe_name],$basic_ingredients);
                $cluster_recipe_ingredients = array_merge(
                    $cluster_recipe_ingredients,
                    array_unique($main_ingredients));
            }
            $ingredient_occurrence =
                array_count_values($cluster_recipe_ingredients);
            $cluster_ingredient = "";
            if(count($ingredient_occurrence) > 0) {
                $max = max($ingredient_occurrence);
                foreach($ingredient_occurrence as $key => $value){
                    if($max == $value) {
                        $cluster_ingredient = $key;
                        break;
                    }
                }
            }
            $cluster["ingredient"] = $cluster_ingredient;
            $new_clusters[] = $cluster;
        }
        return $new_clusters;

    }
}
/**
 * heap to maintain the MST
 * (extract() returns the edge with the highest cost first)
 */
class Cluster extends SplHeap
{

    public function compare($edge1,$edge2)
    {
        $values1 = $edge1->getCost();
        $values2 = $edge2->getCost();
        if ($values1 == $values2) return 0;
        return $values1 < $values2 ? -1 : 1;
    }
}
/**
 * heap to maintain the tree
 * (extract() returns the edge with the lowest cost first)
 */
class TreeCluster extends SplHeap
{

    public function compare($edge1,$edge2)
    {
        $values1 = $edge1->getCost();
        $values2 = $edge2->getCost();
        if ($values1 == $values2) return 0;
        return $values1 > $values2 ? -1 : 1;
    }
}

/**
 * queue for the BFS traversal
 *
 * First-in first-out queue. It grows as needed, so a queue holding $size
 * (or more) items is never mistaken for an empty one and no pending item
 * is overwritten.
 */
class Queue
{
    private $size;
    private $queArray;
    private $front;
    private $rear;

    /**
     * @param int $size expected number of items; kept for compatibility,
     *      it no longer limits the queue
     */
    function __construct($size){
        $this->queArray = array();
        $this->front = 0;
        $this->rear = -1;
        $this->size = $size;
    }

    /**
     * Appends an item at the rear of the queue
     * @param mixed $i item to add
     */
    function enqueue($i){
        $this->queArray[++$this->rear] = $i;
    }

    /**
     * Removes and returns the item at the front of the queue
     * @return mixed the oldest item, or NULL if the queue is empty
     */
    function dequeue(){
        if($this->isEmpty())
            return NULL;
        $temp = $this->queArray[$this->front];
        // free the slot so a long traversal does not keep every item alive
        unset($this->queArray[$this->front]);
        $this->front++;
        return $temp;
    }

    /** @return bool true if no item is waiting in the queue */
    function isEmpty(){
        return $this->front > $this->rear;
    }

}
+ * @return object array $min_edges MST edges in the order they were
+ *      accepted; an empty array when no edge could be accepted
+ */
+function construct_tree($edges) {
+    // vertex label => number of the component the vertex currently is in
+    $vertices = array();
+    $tree_heap = new TreeCluster();
+    $vertice_no = 1;
+    // NOTE(review): the bound stops one short of the last entry of $edges.
+    // Kept as in the original -- confirm whether skipping the final entry
+    // is intended.
+    $edge_limit = count($edges) - 1;
+    for($i = 0; $i < $edge_limit; $i++) {
+        // each input row is (start recipe, end recipe, distance)
+        $edge1 = new Edge($edges[$i][0], $edges[$i][1], $edges[$i][2]);
+        $tree_heap->insert($edge1);
+        $vertex1 = $edge1->getStartVertex();
+        $vertex2 = $edge1->getEndVertex();
+        // first time a vertex is seen it becomes its own component
+        if(empty($vertices[$vertex1->getLabel()])) {
+            $vertices[$vertex1->getLabel()] = $vertice_no;
+            $vertice_no++;
+        }
+        if(empty($vertices[$vertex2->getLabel()])) {
+            $vertices[$vertex2->getLabel()] = $vertice_no;
+            $vertice_no++;
+        }
+    }
+    // initialised up front so callers always receive an array
+    $min_edges = array();
+    $k = 0;
+    $edges_needed = count($vertices) - 1;
+    // A spanning tree has (vertex count - 1) edges. Also stop when the heap
+    // runs dry (no input, or a graph that is not connected): extract() on
+    // an empty SplHeap throws.
+    while($k < $edges_needed && !$tree_heap->isEmpty()) {
+        $min_edge = $tree_heap->extract();
+        $vertex1 = $min_edge->getStartVertex()->getLabel();
+        $vertex2 = $min_edge->getEndVertex()->getLabel();
+        // an edge inside one component would close a cycle; skip it
+        if($vertices[$vertex1] != $vertices[$vertex2]) {
+            // merge: relabel the higher-numbered component with the lower
+            $m = max($vertices[$vertex1], $vertices[$vertex2]);
+            $n = min($vertices[$vertex1], $vertices[$vertex2]);
+            foreach($vertices as $vertex => $no) {
+                if($no == $m) {
+                    $vertices[$vertex] = $n;
+                }
+            }
+            $min_edges[] = $min_edge;
+            $k++;
+        }
+    }
+    return $min_edges;
+}
+
+/**
+ * Clusters the recipes by applying Kruskal's algorithm
+ * @param array $edges recipes and distances between them.
+ *
+ * @param int $count number of recipes.
+ * @param array $distinct_ingredients recipe names with ingredients.
+ * @return clusters of recipes.
+ */
+function kruskalClustering($edges, $count, $distinct_ingredients)
+{
+    // minimum spanning tree over the recipe graph
+    $tree_edges = construct_tree($edges);
+    $tree = new Tree();
+    $tree->constructMST($tree_edges);
+    // the edge count is used as queue size, $count as the recipe total
+    $recipe_clusters = $tree->formCluster(count($tree_edges), $count);
+    // label every cluster with its most common ingredient
+    return $tree->findCommonIngredient($recipe_clusters,
+        $distinct_ingredients);
+}
+?>
diff --git a/lib/processors/page_processor.php b/lib/processors/page_processor.php
new file mode 100644
index 000000000..95a929b7f
--- /dev/null
+++ b/lib/processors/page_processor.php
@@ -0,0 +1,135 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009, 2010, 2011  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage processor
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010, 2011
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Used by subclasses, so have succinct access (i.e., can use self:: rather
+ * than CrawlConstants::) to constants like:
+ * CrawlConstants::TITLE, CrawlConstants::DESCRIPTION, etc.
+ */
+require_once BASE_DIR."/lib/crawl_constants.php";
+
+/**
+ * Base class common to all processors of web page data
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage processor
+ */
+abstract class PageProcessor implements CrawlConstants
+{
+    /**
+     * Names of the indexing plugins which might be used with the current
+     * processor
+     *
+     * @var array
+     */
+    var $indexing_plugins;
+
+    /**
+     * Sets up any indexing plugins associated with this page processor.
+     * For every plugin name an instance of the class ucfirst(name) is
+     * created and stored on this object in the field lcfirst(name).
+     *
+     * @param array $plugins an array of indexing plugin names which might
+     *      do further processing on the data handled by this page
+     *      processor
+     */
+    function __construct($plugins = array()){
+        $this->indexing_plugins = $plugins;
+        foreach($plugins as $plugin) {
+            $class_name = ucfirst($plugin);
+            $field_name = lcfirst($plugin);
+            $this->$field_name = new $class_name();
+        }
+    }
+
+    /**
+     * Method used to handle processing data for a web page. It builds the
+     * page summary via process() (implemented by subclasses) and then lets
+     * each associated plugin produce sub-documents from the same page.
+     *
+     * @param string $page string of a web document
+     * @param string $url location the document came from
+     *
+     * @return array a summary of (title, description, links, and content)
+     *      of the information in $page. When the summary is non-empty and
+     *      plugins are configured it also has a subdocs array containing
+     *      any subdocuments returned from a plugin. A subdocument might be
+     *      things like recipes that appeared in a page or tweets, etc.
+     */
+    function handle($page, $url)
+    {
+        $summary = $this->process($page, $url);
+        // nothing to add when there is no summary or no plugin list
+        if($summary == NULL || !isset($this->indexing_plugins) ||
+            !is_array($this->indexing_plugins)) {
+            return $summary;
+        }
+        $summary[self::SUBDOCS] = array();
+        foreach($this->indexing_plugins as $plugin) {
+            $field_name = lcfirst($plugin);
+            $subdocs_description =
+                $this->$field_name->pageProcessing($page, $url);
+            if(!is_array($subdocs_description)
+                || count($subdocs_description) == 0) {
+                continue;
+            }
+            // plugin name with its trailing "Plugin" cut off
+            $subdoc_type = lcfirst(
+                substr($plugin, 0, -strlen("Plugin")));
+            foreach($subdocs_description as $subdoc_description) {
+                // title and description come from the plugin; language,
+                // links and page text are those of the enclosing page
+                $summary[self::SUBDOCS][] = array(
+                    self::TITLE => $subdoc_description[self::TITLE],
+                    self::DESCRIPTION =>
+                        $subdoc_description[self::DESCRIPTION],
+                    self::LANG => $summary[self::LANG],
+                    self::LINKS => $summary[self::LINKS],
+                    self::PAGE => $page,
+                    self::SUBDOCTYPE => $subdoc_type
+                );
+            }
+        }
+        return $summary;
+    }
+
+    /**
+     * Should be implemented to compute a summary based on a
+     * text string of a document. This method is called from
+     * @see handle($page, $url)
+     *
+     * @param string $page string of a document
+     * @param string $url location the document came from
+     *
+     * @return array a summary of (title, description,links, and content) of
+     *      the information in $page
+     */
+    abstract function process($page, $url);
+}
+
+?>