Checkin Vijaya code, a=vijaya

unknown [2011-05-26 21:May:th]
Checkin Vijaya code, a=vijaya
Filename
bin/queue_server.php
configs/config.php
controllers/toolbar_controller.php
index.php
diff --git a/bin/queue_server.php b/bin/queue_server.php
index a7f53db38..0dd08db89 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -75,6 +75,8 @@ require_once BASE_DIR."/lib/fetch_url.php";
 /** Loads common constants for web crawling*/
 require_once BASE_DIR."/lib/crawl_constants.php";

+require_once BASE_DIR."/lib/phrase_parser.php";
+
 /*
  *  We'll set up multi-byte string handling to use UTF-8
  */
@@ -257,7 +259,9 @@ class QueueServer implements CrawlConstants

             //check for orphaned queue bundles
             $this->deleteOrphanedBundles();
-
+
+			//check for toolbar data files to process
+			$this->processToolbarData();

             $this->processIndexData();
             if(time() - $this->last_index_save_time > FORCE_SAVE_TIME){
@@ -673,6 +677,119 @@ class QueueServer implements CrawlConstants
         crawlLog("done.");
     }

+    /**
+     * Sets up the directory in which to look for unprocessed toolbar data
+     * files, then calls processDataFile to process the oldest file found
+     * using processToolbarDataInvertedIndex as the handler
+     */
+    function processToolbarData()
+   {
+       echo " In the function processToolbarData";
+       crawlLog("Checking for toolbar data files to process...");
+
+       $index_dir =  CRAWL_DIR."/schedules/".
+           "ToolbarData";
+       $this->processDataFile($index_dir, "processToolbarDataInvertedIndex");
+       crawlLog("done.");
+	   echo " End of the function processToolbarData";
+   }
+
+   /**
+    * Builds a mini inverted index (an IndexShard) from a file received from
+    * the toolbar extension, then adds it to the index archive.
+    *
+    * @param string $file path of the toolbar data file to process; the file
+    *      is deleted (unlink) once its data has been added to the index
+    */
+    function processToolbarDataInvertedIndex($file)
+    {
+        echo " In the function processToolbarDataInvertedIndex";
+        static $first = true;
+        crawlLog(
+            "Start processing toolbar data memory usage".
+            memory_get_usage() . "...");
+        crawlLog("Processing toolbar data in $file...");
+
+        $start_time = microtime();
+        $rowdelimiter = ",";
+        $delimiter = "|:|";
+        $filecontent = file_get_contents($file);
+
+        $rows = explode($rowdelimiter, $filecontent);
+
+        foreach ($rows as $newrow) {
+            $tok = explode($delimiter, $newrow);
+            $site[self::LINKS][$tok[2]]= $tok[0];
+            $site[self::TIMESTAMP]= $tok[3];
+            $site[self::ENCODING]= $tok[4];
+        }
+
+        $toolbar_shard = new IndexShard("toolbar_shard");
+        $seen_sites = array();
+        foreach($site[self::LINKS] as $url => $link_text) {
+            if(strlen($url) > 0) {
+                $summary = array();
+
+                $had_links = true;
+
+                $link_text = strip_tags($link_text);
+                $link_id =
+                    "url|".$url."|text|$link_text|ref|".$site[self::URL];
+
+                $link_keys = crawlHash($url, true) .
+                      crawlHash($link_id, true) .
+                      crawlHash("info:".$url, "true");
+
+                $summary[self::HASH_URL] =  $link_keys;
+		        $summary[self::URL] =  $link_id;
+                $summary[self::TITLE] = $url;
+                   // stripping html to be on the safe side
+                $summary[self::DESCRIPTION] =  $link_text;
+                $summary[self::TIMESTAMP] =  $site[self::TIMESTAMP];
+                $summary[self::ENCODING] = $site[self::ENCODING];
+                $summary[self::HASH] =  $link_id;
+                $summary[self::TYPE] = "link";
+                $summary[self::HTTP_CODE] = "link";
+                $seen_sites[] = $summary;
+
+                $link_text =
+                    mb_ereg_replace(PUNCT, " ", $link_text);
+
+                $link_word_counts =
+                    PhraseParser::extractPhrasesAndCount($link_text,
+                    MAX_PHRASE_LEN, $lang);
+
+                $toolbar_shard->addDocumentWords($link_keys,
+                    self::NEEDS_OFFSET_FLAG,
+                    $link_word_counts, array());
+            }
+        }
+
+        $visited_urls_count = 0;
+        $generation =
+             $this->index_archive->initGenerationToAdd($toolbar_shard);
+
+        $summary_offsets = array();
+        if(isset($seen_sites)) {
+            $this->index_archive->addPages(
+                $generation, self::SUMMARY_OFFSET, $seen_sites,
+                $visited_urls_count);
+
+            foreach($seen_sites as $site) {
+                $hash = $site[self::HASH_URL];
+                $dict_word =  NULL;
+                $summary_offsets[$hash] =
+                    array($site[self::SUMMARY_OFFSET], $dict_word);
+            }
+        }
+        $toolbar_shard->changeDocumentOffsets($summary_offsets);
+        $this->index_archive->addIndexData($toolbar_shard);
+        $this->index_dirty = true;
+		unlink($file);
+
+    }
+
+
     /**
      * Adds the summary and index data in $file to summary bundle and word index
      *
@@ -781,6 +898,7 @@ class QueueServer implements CrawlConstants
         crawlLog("D (add index shard) memory usage".memory_get_usage().
             " time: ".(changeInMicrotime($start_time)));

+
         crawlLog("Done Processing File: $file");

         unlink($file);
diff --git a/configs/config.php b/configs/config.php
index d039463db..bd79d56c6 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -50,7 +50,7 @@ if(file_exists(BASE_DIR."/configs/local_config.php")) {
 if(!defined('WORK_DIRECTORY')) {
 /*+++ The next block of code is machine edited, change at
 your own risk, please use configure web page instead +++*/
-define('WORK_DIRECTORY', '');
+define('WORK_DIRECTORY', 'c:/xampp/xampp/htdocs/yioop_data');
 /*++++++*/
 }

diff --git a/controllers/toolbar_controller.php b/controllers/toolbar_controller.php
new file mode 100644
index 000000000..aa636817a
--- /dev/null
+++ b/controllers/toolbar_controller.php
@@ -0,0 +1,134 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009, 2010  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage controller
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/** Load base controller class if needed */
+require_once BASE_DIR."/controllers/controller.php";
+/** Loads common constants for web crawling*/
+require_once BASE_DIR."/lib/crawl_constants.php";
+
+/**
+ * This class handles data posted to a queue_server by the toolbar
+ * extension. It receives the posted data and saves it into a file in the
+ * ToolbarData schedules folder for later processing by the queue server.
+ * The only activity it supports is toolbarTraffic.
+ *
+ * @author Chris Pollett
+ * @package seek_quarry
+ * @subpackage controller
+ */
+class ToolbarController extends Controller implements CrawlConstants
+{
+    /**
+     * No models used by this controller
+     * @var array
+     */
+    var $models = array();
+    /**
+     * Load FetchView to return results to fetcher
+     * @var array
+     */
+    var $views = array("fetch");
+    /**
+     * These are the activities supported by this controller
+     * @var array
+     */
+    var $activities = array("toolbarTraffic");
+
+
+    /**
+     * Determines which activity is being requested (request variable a) and,
+     * if it is one of this controller's activities, calls it for processing.
+     * NOTE(review): no check is made yet that the sender is legitimate.
+     *
+     */
+    function processRequest()
+    {
+        $data = array();
+
+        /* TODO: do a quick test to see if this request seems to come
+           from a legitimate machine (not yet implemented)
+         */
+
+
+        $activity = $_REQUEST['a'];
+        //echo "OK";
+        if(in_array($activity, $this->activities)) {$this->$activity();}
+
+    }
+    /**
+     * Writes the toolbar data in $_POST["b"] to a file, named using the time
+     * and sender address, in a per-day subfolder of schedules/ToolbarData
+     *
+     * @param string &$data_string only used to compute the hash in the file
+     *      name. NOTE(review): processRequest calls this with no argument
+     */
+
+    function toolbarTraffic(&$data_string)
+    {
+        $toolbar_data = $_POST["b"];
+        $time = time();
+
+        $dir = CRAWL_DIR."/schedules/"."ToolbarData";
+
+        //echo "$dir";
+
+        $address = str_replace(".", "-", $_SERVER['REMOTE_ADDR']);
+        $address = str_replace(":", "_", $address);
+        //$time = time();
+        $day = floor($time/86400);
+
+        if(!file_exists($dir)) {
+            mkdir($dir);
+            chmod($dir, 0777);
+        }
+
+        $dir .= "/$day";
+        if(!file_exists($dir)) {
+            mkdir($dir);
+            chmod($dir, 0777);
+        }
+        $data_hash = crawlHash($data_string);
+
+        $fname= $dir."/At".$time."From".$address."WithHash$data_hash.txt";
+
+        $fh = fopen($fname, "a+");
+        fwrite($fh, $toolbar_data);
+        fclose($fh);
+        //echo "OK TEST";
+        return true;
+
+    }
+}
+?>
diff --git a/index.php b/index.php
index 0a3911ae8..353006328 100755
--- a/index.php
+++ b/index.php
@@ -1,5 +1,5 @@
 <?php
-/**
+/**
  *  SeekQuarry/Yioop --
  *  Open Source Pure PHP Search Engine, Crawler, and Indexer
  *
@@ -21,7 +21,7 @@
  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
  *
  *  END LICENSE
- *
+ *
  * Main web interface entry point for Yioop!
  * search site. Used to both get and display
  * search results. Also used for inter-machine
@@ -35,7 +35,7 @@
  * @filesource
  */

-/** Calculate base directory of script
+/** Calculate base directory of script
  *  @ignore
  */
 define("BASE_DIR", substr($_SERVER['SCRIPT_FILENAME'], 0,-strlen("index.php")));
@@ -46,12 +46,12 @@ define("BASE_DIR", substr($_SERVER['SCRIPT_FILENAME'], 0,-strlen("index.php")));
 require_once(BASE_DIR.'configs/config.php');
 ini_set("memory_limit","500M");
 header("X-FRAME-OPTIONS: DENY"); //prevent click jacking
-session_name(SESSION_NAME);
+session_name(SESSION_NAME);
 session_start();
 /**
  * Sets up DB to be used
  */
-require_once(BASE_DIR."/models/datasources/".DBMS."_manager.php");
+require_once(BASE_DIR."/models/datasources/".DBMS."_manager.php");

 if(USE_MEMCACHE) {
     $MEMCACHE = new Memcache();
@@ -76,10 +76,10 @@ if ( false === function_exists('lcfirst') ) {
      */
     function lcfirst( $str )
     { return (string)(strtolower(substr($str,0,1)).substr($str,1));}
-}
+}

-$available_controllers = array("search", "fetch", "cache",
-    "settings", "admin", "archive");
+$available_controllers = array("search", "fetch", "cache",
+    "settings", "admin", "archive","toolbar");

 //the request variable c is used to determine the controller
 if(!isset($_REQUEST['c'])) {
@@ -98,7 +98,7 @@ if(!PROFILE ) {
     $controller_name = "admin";
 }

-//the request variable l is used to determine the locale
+//the request variable l is used to determine the locale
 if(isset($_SESSION['l']) ||isset($_REQUEST['l'])) {
     $l = (isset($_REQUEST['l'])) ? $_REQUEST['l'] : $_SESSION['l'];
     if(strlen($l) < 10) {
@@ -125,9 +125,9 @@ setLocaleObject($locale_tag);


 /**
- * Loads controller responsible for calculating
+ * Loads controller responsible for calculating
  * the data needed to render the scene
- *
+ *
  */
 require_once(BASE_DIR."/controllers/".$controller_name."_controller.php");
 $controller_class = ucfirst($controller_name)."Controller";
@@ -139,7 +139,7 @@ $controller->processRequest();
  * Verifies that the supplied controller string is a controller for the
  * SeekQuarry app
  *
- * @param string $controller_name  name of controller
+ * @param string $controller_name  name of controller
  *      (this usually come from the query string)
  * @return bool  whether it is a valid controller
  */
@@ -186,7 +186,7 @@ function tl()
 /**
  * Sets the language to be used for locale settings
  *
- * @param string $locale_tag the tag of the language to use to determine
+ * @param string $locale_tag the tag of the language to use to determine
  *      locale settings
  */
 function setLocaleObject($locale_tag)
@@ -197,10 +197,10 @@ function setLocaleObject($locale_tag)
 }

 /**
- * Gets the language tag (for instance, en_US for American English) of the
+ * Gets the language tag (for instance, en_US for American English) of the
  * locale that is currently being used.
  *
- * @return string  the tag of the language currently being used for locale
+ * @return string  the tag of the language currently being used for locale
  *      settings
  */
 function getLocaleTag()
@@ -210,9 +210,9 @@ function getLocaleTag()
 }

 /**
- * Returns the current language directions.
+ * Returns the current language directions.
  *
- * @return string ltr or rtl depending on if the language is left-to-right
+ * @return string ltr or rtl depending on if the language is left-to-right
  * or right-to-left
  */
 function getLocaleDirection()
@@ -222,9 +222,9 @@ function getLocaleDirection()
 }

 /**
- * Returns the current locales method of writing blocks (things like divs or
- * paragraphs).A language like English puts blocks one after another from the
- * top of the page to the bottom. Other languages like classical Chinese list
+ * Returns the current locales method of writing blocks (things like divs or
+ * paragraphs).A language like English puts blocks one after another from the
+ * top of the page to the bottom. Other languages like classical Chinese list
  * them from right to left.
  *
  *  @return string  tb lr rl depending on the current locales block progression
@@ -237,8 +237,8 @@ function getBlockProgression()
 }

 /**
- * Returns the writing mode of the current locale. This is a combination of the
- * locale direction and the block progression. For instance, for English the
+ * Returns the writing mode of the current locale. This is a combination of the
+ * locale direction and the block progression. For instance, for English the
  * writing mode is lr-tb (left-to-right top-to-bottom).
  *
  *  @return string   the locales writing mode
ViewGit