diff --git a/bin/queue_server.php b/bin/queue_server.php
index a7f53db38..0dd08db89 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -75,6 +75,9 @@ require_once BASE_DIR."/lib/fetch_url.php";
 
 /** Loads common constants for web crawling*/
 require_once BASE_DIR."/lib/crawl_constants.php";
+/** Loads PhraseParser, used to extract phrases from toolbar link text */
+require_once BASE_DIR."/lib/phrase_parser.php";
+
 /*
  * We'll set up multi-byte string handling to use UTF-8
  */
@@ -257,7 +260,9 @@ class QueueServer implements CrawlConstants
 
             //check for orphaned queue bundles
             $this->deleteOrphanedBundles();
 
+            //check for toolbardata
+            $this->processToolbarData();
             $this->processIndexData();
 
             if(time() - $this->last_index_save_time > FORCE_SAVE_TIME){
@@ -673,6 +678,125 @@ class QueueServer implements CrawlConstants
         crawlLog("done.");
     }
 
+    /**
+     * Sets up the directory to look in for files of unprocessed data
+     * sent by the toolbar, then calls the function processDataFile to
+     * process the oldest file found
+     */
+    function processToolbarData()
+    {
+        crawlLog("Checking for toolbar data files to process...");
+
+        $toolbar_dir = CRAWL_DIR."/schedules/ToolbarData";
+        $this->processDataFile($toolbar_dir,
+            "processToolbarDataInvertedIndex");
+        crawlLog("done.");
+    }
+
+    /**
+     * Builds a mini inverted index for a file received from the toolbar
+     * extension, then adds it and a summary for each of the file's links
+     * to the index archive. The file is deleted after it has been read.
+     *
+     * Rows of the file are separated by "," and the fields of a row by
+     * "|:|". Field 0 is used as the link text, field 2 as the url, field 3
+     * as the timestamp and field 4 as the encoding. Rows with fewer than
+     * five fields or with an empty url are skipped. If a url occurs in
+     * more than one row, the last such row is the one used.
+     *
+     * @param string $file name of the toolbar data file to process
+     */
+    function processToolbarDataInvertedIndex($file)
+    {
+        crawlLog("Start processing toolbar data memory usage".
+            memory_get_usage() . "...");
+        crawlLog("Processing toolbar data in $file...");
+
+        $row_delimiter = ",";
+        $field_delimiter = "|:|";
+        $file_content = file_get_contents($file);
+        if($file_content === false) {
+            crawlLog("Could not read toolbar data in $file");
+            return;
+        }
+
+        // collect link text, timestamp and encoding per url so that each
+        // link keeps the values from its own row
+        $links = array();
+        foreach(explode($row_delimiter, $file_content) as $row) {
+            $tok = explode($field_delimiter, $row);
+            if(count($tok) < 5 || strlen($tok[2]) == 0) {
+                continue;
+            }
+            $links[$tok[2]] = array($tok[0], $tok[3], $tok[4]);
+        }
+
+        // none of the fields read above gives the page a link was seen on
+        // (field 1 is not read -- TODO confirm what it holds), so the ref
+        // part of a link id stays empty
+        $ref_url = "";
+        // no language is known for the link text; NULL is what the
+        // previously undefined $lang evaluated to
+        $lang = NULL;
+
+        $toolbar_shard = new IndexShard("toolbar_shard");
+        $seen_sites = array();
+        foreach($links as $url => $link) {
+            list($link_text, $timestamp, $encoding) = $link;
+            // stripping html to be on the safe side
+            $link_text = strip_tags($link_text);
+            $link_id = "url|".$url."|text|$link_text|ref|".$ref_url;
+            $link_keys = crawlHash($url, true) .
+                crawlHash($link_id, true) .
+                crawlHash("info:".$url, true);
+
+            $summary = array();
+            $summary[self::HASH_URL] = $link_keys;
+            $summary[self::URL] = $link_id;
+            $summary[self::TITLE] = $url;
+            $summary[self::DESCRIPTION] = $link_text;
+            $summary[self::TIMESTAMP] = $timestamp;
+            $summary[self::ENCODING] = $encoding;
+            $summary[self::HASH] = $link_id;
+            $summary[self::TYPE] = "link";
+            $summary[self::HTTP_CODE] = "link";
+            $seen_sites[] = $summary;
+
+            $link_text = mb_ereg_replace(PUNCT, " ", $link_text);
+            $link_word_counts =
+                PhraseParser::extractPhrasesAndCount($link_text,
+                MAX_PHRASE_LEN, $lang);
+            $toolbar_shard->addDocumentWords($link_keys,
+                self::NEEDS_OFFSET_FLAG, $link_word_counts, array());
+        }
+
+        if(count($seen_sites) == 0) {
+            // no usable rows, so there is nothing to add to the index
+            crawlLog("No toolbar data to index in $file");
+            unlink($file);
+            return;
+        }
+
+        $visited_urls_count = 0;
+        $generation =
+            $this->index_archive->initGenerationToAdd($toolbar_shard);
+        $this->index_archive->addPages(
+            $generation, self::SUMMARY_OFFSET, $seen_sites,
+            $visited_urls_count);
+
+        // assumes addPages has stored a SUMMARY_OFFSET in each entry of
+        // $seen_sites -- TODO confirm against the index archive code
+        $summary_offsets = array();
+        foreach($seen_sites as $seen_site) {
+            $summary_offsets[$seen_site[self::HASH_URL]] =
+                array($seen_site[self::SUMMARY_OFFSET], NULL);
+        }
+        $toolbar_shard->changeDocumentOffsets($summary_offsets);
+        $this->index_archive->addIndexData($toolbar_shard);
+        $this->index_dirty = true;
+        unlink($file);
+    }
+
     /**
      * Adds the summary and index data in $file to summary bundle and word index
      *
@@ -781,6 +905,7 @@ class QueueServer implements CrawlConstants
 
         crawlLog("D (add index shard) memory usage".memory_get_usage().
             " time: ".(changeInMicrotime($start_time)));
+        crawlLog("Done Processing File: $file");
         unlink($file);
 
 
diff --git a/controllers/toolbar_controller.php b/controllers/toolbar_controller.php
new file mode 100644
index 000000000..aa636817a
--- /dev/null
+++ b/controllers/toolbar_controller.php
@@ -0,0 +1,134 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage controller
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/** Load base controller class if needed */
+require_once BASE_DIR."/controllers/controller.php";
+/** Loads common constants for web crawling*/
+require_once BASE_DIR."/lib/crawl_constants.php";
+
+/**
+ * This class handles data sent to a queue_server by the browser toolbar.
+ * It receives the data and saves it into a file in the ToolbarData
+ * schedules folder for later processing by the queue server.
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage controller
+ */
+class ToolbarController extends Controller implements CrawlConstants
+{
+    /**
+     * No models used by this controller
+     * @var array
+     */
+    var $models = array();
+    /**
+     * FetchView is listed here, though no activity below renders a view
+     * @var array
+     */
+    var $views = array("fetch");
+    /**
+     * These are the activities supported by this controller
+     * @var array
+     */
+    var $activities = array("toolbarTraffic");
+
+
+    /**
+     * Determines which activity the request asks for and, if it is one of
+     * the supported activities, calls it. A request that names no activity
+     * or an unsupported one is ignored.
+     *
+     * NOTE(review): nothing here checks that the request comes from a
+     * legitimate machine, so anyone who can reach this url can submit data.
+     */
+    function processRequest()
+    {
+        // a missing activity is treated like an unsupported one instead
+        // of being read as an undefined index
+        $activity = isset($_REQUEST['a']) ? $_REQUEST['a'] : "";
+        if(in_array($activity, $this->activities, true)) {
+            $this->$activity();
+        }
+    }
+
+    /**
+     * Saves the toolbar data posted in the request variable b to a file
+     * whose name contains the time, the sender's address and a hash, in a
+     * per-day subfolder of the schedules/ToolbarData folder.
+     * Nothing is saved if b was not posted as a non-empty string or if a
+     * folder or the file could not be created.
+     *
+     * @param string &$data_string if supplied, the string whose hash goes
+     *      into the file name; otherwise the posted data is hashed
+     * @return bool true if the data was written, false otherwise
+     */
+    function toolbarTraffic(&$data_string = NULL)
+    {
+        $toolbar_data = isset($_POST["b"]) ? $_POST["b"] : "";
+        if(!is_string($toolbar_data) || $toolbar_data === "") {
+            return false;
+        }
+
+        $time = time();
+        $day = floor($time/86400);
+        $address = str_replace(".", "-", $_SERVER['REMOTE_ADDR']);
+        $address = str_replace(":", "_", $address);
+
+        // make schedules/ToolbarData, then the subfolder for today
+        $dir = CRAWL_DIR."/schedules/ToolbarData";
+        foreach(array($dir, "$dir/$day") as $folder) {
+            if(file_exists($folder)) {continue;}
+            if(!mkdir($folder)) {return false;}
+            chmod($folder, 0777);
+        }
+        $dir .= "/$day";
+
+        // processRequest passes no $data_string, so hash the posted data
+        $hash_input = ($data_string === NULL) ? $toolbar_data : $data_string;
+        $data_hash = crawlHash($hash_input);
+        $fname = $dir."/At".$time."From".$address."WithHash$data_hash.txt";
+
+        // "a+" appends should a file with this name already exist
+        $fh = fopen($fname, "a+");
+        if($fh === false) {
+            return false;
+        }
+        $written = fwrite($fh, $toolbar_data);
+        fclose($fh);
+        return $written !== false;
+    }
+}
+?>
diff --git a/index.php b/index.php
index 0a3911ae8..353006328 100755
--- a/index.php
+++ b/index.php
@@ -1,5 +1,5 @@
 <?php
-/**
+/**
  * SeekQuarry/Yioop --
  * Open Source Pure PHP Search Engine, Crawler, and Indexer
  *
@@ -21,7 +21,7 @@
  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  *
  * END LICENSE
- *
+ *
  * Main web interface entry point for Yioop!
  * search site. Used to both get and display
  * search results.
Also used for inter-machine @@ -35,7 +35,7 @@ * @filesource */ -/** Calculate base directory of script +/** Calculate base directory of script * @ignore */ define("BASE_DIR", substr($_SERVER['SCRIPT_FILENAME'], 0,-strlen("index.php"))); @@ -46,12 +46,12 @@ define("BASE_DIR", substr($_SERVER['SCRIPT_FILENAME'], 0,-strlen("index.php"))); require_once(BASE_DIR.'configs/config.php'); ini_set("memory_limit","500M"); header("X-FRAME-OPTIONS: DENY"); //prevent click jacking -session_name(SESSION_NAME); +session_name(SESSION_NAME); session_start(); /** * Sets up DB to be used */ -require_once(BASE_DIR."/models/datasources/".DBMS."_manager.php"); +require_once(BASE_DIR."/models/datasources/".DBMS."_manager.php"); if(USE_MEMCACHE) { $MEMCACHE = new Memcache(); @@ -76,10 +76,10 @@ if ( false === function_exists('lcfirst') ) { */ function lcfirst( $str ) { return (string)(strtolower(substr($str,0,1)).substr($str,1));} -} +} -$available_controllers = array("search", "fetch", "cache", - "settings", "admin", "archive"); +$available_controllers = array("search", "fetch", "cache", + "settings", "admin", "archive","toolbar"); //the request variable c is used to determine the controller if(!isset($_REQUEST['c'])) { @@ -98,7 +98,7 @@ if(!PROFILE ) { $controller_name = "admin"; } -//the request variable l is used to determine the locale +//the request variable l is used to determine the locale if(isset($_SESSION['l']) ||isset($_REQUEST['l'])) { $l = (isset($_REQUEST['l'])) ? 
$_REQUEST['l'] : $_SESSION['l']; if(strlen($l) < 10) { @@ -125,9 +125,9 @@ setLocaleObject($locale_tag); /** - * Loads controller responsible for calculating + * Loads controller responsible for calculating * the data needed to render the scene - * + * */ require_once(BASE_DIR."/controllers/".$controller_name."_controller.php"); $controller_class = ucfirst($controller_name)."Controller"; @@ -139,7 +139,7 @@ $controller->processRequest(); * Verifies that the supplied controller string is a controller for the * SeekQuarry app * - * @param string $controller_name name of controller + * @param string $controller_name name of controller * (this usually come from the query string) * @return bool whether it is a valid controller */ @@ -186,7 +186,7 @@ function tl() /** * Sets the language to be used for locale settings * - * @param string $locale_tag the tag of the language to use to determine + * @param string $locale_tag the tag of the language to use to determine * locale settings */ function setLocaleObject($locale_tag) @@ -197,10 +197,10 @@ function setLocaleObject($locale_tag) } /** - * Gets the language tag (for instance, en_US for American English) of the + * Gets the language tag (for instance, en_US for American English) of the * locale that is currently being used. * - * @return string the tag of the language currently being used for locale + * @return string the tag of the language currently being used for locale * settings */ function getLocaleTag() @@ -210,9 +210,9 @@ function getLocaleTag() } /** - * Returns the current language directions. + * Returns the current language directions. 
* - * @return string ltr or rtl depending on if the language is left-to-right + * @return string ltr or rtl depending on if the language is left-to-right * or right-to-left */ function getLocaleDirection() @@ -222,9 +222,9 @@ function getLocaleDirection() } /** - * Returns the current locales method of writing blocks (things like divs or - * paragraphs).A language like English puts blocks one after another from the - * top of the page to the bottom. Other languages like classical Chinese list + * Returns the current locales method of writing blocks (things like divs or + * paragraphs).A language like English puts blocks one after another from the + * top of the page to the bottom. Other languages like classical Chinese list * them from right to left. * * @return string tb lr rl depending on the current locales block progression @@ -237,8 +237,8 @@ function getBlockProgression() } /** - * Returns the writing mode of the current locale. This is a combination of the - * locale direction and the block progression. For instance, for English the + * Returns the writing mode of the current locale. This is a combination of the + * locale direction and the block progression. For instance, for English the * writing mode is lr-tb (left-to-right top-to-bottom). * * @return string the locales writing mode