diff --git a/bin/fetcher.php b/bin/fetcher.php
index e1bc72a2d..a29b187c5 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -36,7 +36,7 @@
 define("BASE_DIR", substr(
     dirname(realpath($_SERVER['PHP_SELF'])), 0,
     -strlen("/bin")));
 
-ini_set("memory_limit","600M"); //so have enough memory to crawl big pages
+ini_set("memory_limit","700M"); //so we have enough memory to crawl big pages
 
/** Load in global configuration settings */
 require_once BASE_DIR.'/configs/config.php';
@@ -333,6 +333,7 @@ class Fetcher implements CrawlConstants
         $this->found_duplicates = array_merge($this->found_duplicates,
             $duplicates);
         if($can_schedule_again == true) {
+            //only schedule failed sites without a crawl-delay to crawl again
             foreach($schedule_again_pages as $schedule_again_page) {
                 if($schedule_again_page[self::CRAWL_DELAY] == 0) {
                     $this->to_crawl_again[] =
@@ -437,15 +438,27 @@ class Fetcher implements CrawlConstants
         $request =
             $queue_server."?c=fetch&a=schedule&time=$time&session=$session";
 
-        $info_string = FetchUrl::getPage($request);
-        $info = unserialize(trim($info_string));
+        $info_string = trim(FetchUrl::getPage($request));
+        $tok = strtok($info_string, "\n");
+        $info = unserialize(base64_decode($tok));
 
         if(isset($info[self::CRAWL_ORDER])) {
             $this->crawl_order = $info[self::CRAWL_ORDER];
         }
 
         if(isset($info[self::SITES])) {
-            $this->to_crawl = $info[self::SITES];
+            $this->to_crawl = array();
+            while($tok !== false) {
+                $string = base64_decode($tok);
+                $tmp = unpack("f", substr($string, 0 , 4));
+                $weight = $tmp[1];
+                $tmp = unpack("N", substr($string, 4 , 4));
+                $delay = $tmp[1];
+                $url = substr($string, 8);
+                $this->to_crawl[] = array($url, $weight, $delay);
+
+                $tok = strtok("\n");
+            }
         }
 
         if(isset($info[self::SCHEDULE_TIME])) {
@@ -645,8 +658,7 @@ class Fetcher implements CrawlConstants
                 $site[self::URL]);
 
             if($doc_info) {
-
-                $site[self::DOC_INFO] = $doc_info;
+                $site[self::DOC_INFO] = $doc_info;
 
                 if(!is_dir(CRAWL_DIR."/cache")) {
                     mkdir(CRAWL_DIR."/cache");
@@ -888,26 +900,95 @@ class Fetcher implements CrawlConstants
 
         if(count($this->to_crawl) <= 0) {
-            $this->found_sites[self::SCHEDULE_TIME] = $this->schedule_time;
+            $schedule_time = $this->schedule_time;
         }
+        /*
+            In what follows, as we generate the post data we delete items
+            from $this->found_sites, to try to minimize our memory
+            footprint.
+         */
+        $bytes_to_send = 0;
+        $post_data = array('c'=>'fetch', 'a'=>'update',
+            'crawl_time' => $this->crawl_time, 'machine_uri' => WEB_URI);
+
+        //handle robots.txt data
+        if(isset($this->found_sites[self::ROBOT_TXT])) {
+            $post_data['robot_data'] = urlencode(base64_encode(
+                gzcompress(serialize($this->found_sites[self::ROBOT_TXT]))));
+            unset($this->found_sites[self::ROBOT_TXT]);
+            $bytes_to_send += strlen($post_data['robot_data']);
+        }
+
+        //handle schedule data
+        $schedule_data = array();
+        if(isset($this->found_sites[self::TO_CRAWL])) {
+            $schedule_data[self::TO_CRAWL] = &
+                $this->found_sites[self::TO_CRAWL];
+        }
+        unset($this->found_sites[self::TO_CRAWL]);
+
         if(isset($this->found_sites[self::SEEN_URLS]) &&
             count($this->found_sites[self::SEEN_URLS]) > 0 ) {
-            $this->buildMiniInvertedIndex();
+            $hash_seen_urls = array();
+            $recent_urls = array();
+            $cnt = 0;
+            foreach($this->found_sites[self::SEEN_URLS] as $site) {
+                $hash_seen_urls[] =
+                    crawlHash($site[self::URL], true);
+                if(strpos($site[self::URL], "url|") !== 0) {
+                    array_push($recent_urls, $site[self::URL]);
+                    if($cnt >= NUM_RECENT_URLS_TO_DISPLAY)
+                    {
+                        array_shift($recent_urls);
+                    }
+                    $cnt++;
+                }
+            }
+            $schedule_data[self::HASH_SEEN_URLS] = & $hash_seen_urls;
+            unset($hash_seen_urls);
+            $schedule_data[self::RECENT_URLS] = & $recent_urls;
+            unset($recent_urls);
         }
+        if(!empty($schedule_data)) {
+            if(isset($schedule_time)) {
+                $schedule_data[self::SCHEDULE_TIME] = $schedule_time;
+            }
+            $post_data['schedule_data'] = urlencode(base64_encode(
+                gzcompress(serialize($schedule_data))));
+            $bytes_to_send += strlen($post_data['schedule_data']);
+        }
+        unset($schedule_data);
 
-        $post_data = array('c'=>'fetch', 'a'=>'update',
-            'crawl_time' => $this->crawl_time, 'machine_uri' => WEB_URI);
-
-        $post_data['found'] = urlencode(base64_encode(
-            gzcompress(serialize($this->found_sites))));
-        $bytes_to_send = strlen($post_data['found']);
+        //handle mini inverted index
+        if(isset($this->found_sites[self::SEEN_URLS]) &&
+            count($this->found_sites[self::SEEN_URLS]) > 0 ) {
+            $this->buildMiniInvertedIndex();
+        }
+        if(isset($this->found_sites[self::INVERTED_INDEX])) {
+            $index_data = array();
+            $index_data[self::SEEN_URLS] = &
+                $this->found_sites[self::SEEN_URLS];
+            unset($this->found_sites[self::SEEN_URLS]);
+            $index_data[self::INVERTED_INDEX] = &
+                $this->found_sites[self::INVERTED_INDEX];
+            unset($this->found_sites[self::INVERTED_INDEX]);
+            $post_data['index_data'] = urlencode(base64_encode(
+                gzcompress(serialize($index_data))));
+            unset($index_data);
+            $bytes_to_send += strlen($post_data['index_data']);
+        }
 
         $this->found_sites = array(); // reset found_sites so have more space.
+        if($bytes_to_send <= 0) {
+            crawlLog("No data to send, aborting update scheduler...");
+            return;
+        }
+        //try to send to queue server
         $sleep = false;
         do {
-
+            if($sleep == true) {
                 crawlLog("Trouble sending to the scheduler\n $info_string...");
                 sleep(5);
@@ -927,7 +1008,10 @@ class Fetcher implements CrawlConstants
             $info = unserialize(trim($info_string));
             crawlLog("Queue Server info response code: ".$info[self::STATUS]);
             crawlLog("Queue Server's crawl time is: ".$info[self::CRAWL_TIME]);
-
+            crawlLog("Web Server peak memory usage: ".
+                $info[self::MEMORY_USAGE]);
+            crawlLog("This fetcher peak memory usage: ".
+                memory_get_peak_usage());
         } while(!isset($info[self::STATUS]) ||
             $info[self::STATUS] != self::CONTINUE_STATE);
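Review note: the single 'found' POST field is replaced by three independently decodable fields ('robot_data', 'schedule_data', 'index_data'), each encoded serialize -> gzcompress -> base64 -> urlencode. A minimal sketch of the round trip (standalone PHP; the sample array is hypothetical):

    <?php
    // Encode a payload the way the fetcher now builds each POST field.
    function encodePayload($data)
    {
        return urlencode(base64_encode(gzcompress(serialize($data))));
    }
    // Inverse, as applied when the payload is later read back from disk.
    function decodePayload($payload)
    {
        return unserialize(gzuncompress(base64_decode(urldecode($payload))));
    }
    $sample = array('to_crawl' => array(array('http://www.example.com/', 1.0)));
    $encoded = encodePayload($sample);
    var_dump(decodePayload($encoded) == $sample); // bool(true)

The explicit urlencode() appears deliberate: the value is double-encoded in transit, so after PHP automatically decodes the POST body once, the string the fetch controller writes to disk is still urlencoded, and the queue server's urldecode() restores the base64 intact.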
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 06882c111..f0e606f09 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -484,14 +484,21 @@ class QueueServer implements CrawlConstants
         $start_time = microtime();
 
         $index_archive = $this->index_archive;
 
-        $sites = unserialize(file_get_contents($file));
-
+        $fh = fopen($file, "rb");
+        $machine_string = fgets($fh);
+        $len = strlen($machine_string);
+        $machine_info = unserialize(base64_decode($machine_string));
+        $sites = unserialize(gzuncompress(base64_decode(
+            urldecode(fread($fh, filesize($file) - $len))
+            )));
+        fclose($fh);
+
         crawlLog("A memory usage".memory_get_usage() .
             " time: ".(changeInMicrotime($start_time)));
         $start_time = microtime();
 
-        $machine = $sites[self::MACHINE];
-        $machine_uri = $sites[self::MACHINE_URI];
+        $machine = $machine_info[self::MACHINE];
+        $machine_uri = $machine_info[self::MACHINE_URI];
 
         if(isset($sites[self::SEEN_URLS]) &&
             count($sites[self::SEEN_URLS]) > 0) {
@@ -606,9 +613,15 @@ class QueueServer implements CrawlConstants
         crawlLog("Processing Robots data in $file");
         $start_time = microtime();
 
-        $sites = unserialize(file_get_contents($file));
+        $fh = fopen($file, "rb");
+        $machine_string = fgets($fh);
+        $len = strlen($machine_string);
+        unset($machine_string);
+        $sites = unserialize(gzuncompress(base64_decode(
+            urldecode(fread($fh, filesize($file) - $len))
+            )));
+        fclose($fh);
-
         if(isset($sites)) {
             foreach($sites as $robot_host => $robot_info) {
                 $this->web_queue->addGotRobotTxtFilter($robot_host);
@@ -705,10 +718,20 @@ class QueueServer implements CrawlConstants
     {
         crawlLog("Processing File: $file");
 
-        $sites = unserialize(file_get_contents($file));
-
-        if(isset($sites[self::MACHINE])) {
-            $this->most_recent_fetcher = $sites[self::MACHINE];
+        $fh = fopen($file, "rb");
+        $machine_string = fgets($fh);
+        $len = strlen($machine_string);
+        if($len > 0) {
+            $machine_info = unserialize(base64_decode($machine_string));
+        }
+        $sites = unserialize(gzuncompress(base64_decode(
+            urldecode(fread($fh, filesize($file) - $len))
+            )));
+        fclose($fh);
+
+        if(isset($machine_info[self::MACHINE])) {
+            $this->most_recent_fetcher = & $machine_info[self::MACHINE];
+            unset($machine_info);
         }
 
         crawlLog("...Updating Delayed Hosts Array ...");
@@ -730,29 +753,19 @@ class QueueServer implements CrawlConstants
         $start_time = microtime();
 
         $most_recent_urls = array();
 
-        if(isset($sites[self::SEEN_URLS])) {
+        if(isset($sites[self::HASH_SEEN_URLS])) {
             $cnt = 0;
-            foreach($sites[self::SEEN_URLS] as $url) {
-                if($this->web_queue->containsUrlQueue($url)) {
-                    crawlLog(
-                        "Removing $url from Queue (shouldn't still be there!)");
-                    $this->web_queue->removeQueue($url);
+            foreach($sites[self::HASH_SEEN_URLS] as $hash_url) {
+                if($this->web_queue->lookupHashTable($hash_url)) {
+                    crawlLog("Removing hash ". base64_encode($hash_url).
+                        " from Queue");
+                    $this->web_queue->removeQueue($hash_url, true);
                 }
-                if(strpos($url, "url|") !== 0) {
-                    array_push($most_recent_urls, $url);
-                    if($cnt >= NUM_RECENT_URLS_TO_DISPLAY)
-                    {
-                        array_shift($most_recent_urls);
-                    }
-                    $cnt++;
-                }
-            }
             }
         }
         crawlLog(" time: ".(changeInMicrotime($start_time)));
-
         crawlLog("... To Crawl ...");
 
         $start_time = microtime();
         if(isset($sites[self::TO_CRAWL])) {
@@ -801,7 +814,7 @@ class QueueServer implements CrawlConstants
             }
         }
-
+        crawlLog(" time: ".(changeInMicrotime($start_time)));
 
         crawlLog("C..");
@@ -817,7 +830,9 @@ class QueueServer implements CrawlConstants
         $crawl_status = array();
         $crawl_status['MOST_RECENT_FETCHER'] = $this->most_recent_fetcher;
-        $crawl_status['MOST_RECENT_URLS_SEEN'] = $most_recent_urls;
+        if(isset($sites[self::RECENT_URLS])) {
+            $crawl_status['MOST_RECENT_URLS_SEEN'] = $sites[self::RECENT_URLS];
+        }
         $crawl_status['CRAWL_TIME'] = $this->crawl_time;
         $info_bundle = IndexArchiveBundle::getArchiveInfo(
             CRAWL_DIR.'/cache/'.self::index_data_base_name.$this->crawl_time);
@@ -835,9 +850,11 @@ class QueueServer implements CrawlConstants
         crawlLog("Number of unique pages so far: ".
             $info_bundle['VISITED_URLS_COUNT']);
         crawlLog("Total urls extracted so far: ".$info_bundle['COUNT']);
+        if(isset($sites[self::RECENT_URLS])) {
             crawlLog("Of these, the most recent urls are:");
-        foreach($most_recent_urls as $url) {
-            crawlLog("URL: $url");
+            foreach($sites[self::RECENT_URLS] as $url) {
+                crawlLog("URL: $url");
+            }
         }
     }
@@ -876,12 +893,15 @@ class QueueServer implements CrawlConstants
         $count = $this->web_queue->to_crawl_queue->count;
 
+        $sites = array();
         $sites[self::CRAWL_TIME] = $this->crawl_time;
         $sites[self::SCHEDULE_TIME] = time();
         $sites[self::SAVED_CRAWL_TIMES] = $this->getCrawlTimes();
             // fetcher should delete any crawl time not listed here
         $sites[self::CRAWL_ORDER] = $this->crawl_order;
         $sites[self::SITES] = array();
+        $first_line = base64_encode(serialize($sites))."\n";
+
         $delete_urls = array();
         $crawl_delay_hosts = array();
@@ -958,7 +978,7 @@ class QueueServer implements CrawlConstants
             $delay = $this->web_queue->getCrawlDelay($host_url);
 
             $num_waiting = count($this->waiting_hosts);
-
+
             if($delay > 0 ) {
                 // handle adding a url if there is a crawl delay
                 if((!isset($this->waiting_hosts[crawlHash($host_url)])
@@ -1047,8 +1067,17 @@ class QueueServer implements CrawlConstants
         }
 
         ksort($sites[self::SITES]);
-        file_put_contents(CRAWL_DIR."/schedules/schedule.txt",
-            serialize($sites));
+        //write schedule to disk
+        $fh = fopen(CRAWL_DIR."/schedules/schedule.txt", "wb");
+        fwrite($fh, $first_line);
+        foreach($sites[self::SITES] as $site) {
+            list($url, $weight, $delay) = $site;
+            $out_string = base64_encode(
+                pack("f", $weight).pack("N", $delay).$url)."\n";
+            fwrite($fh, $out_string);
+        }
+        fclose($fh);
+        crawlLog("End Produce Fetch Memory usage".memory_get_usage() );
         crawlLog("Created fetch batch... Queue size is now ".
             $this->web_queue->to_crawl_queue->count.
@@ -1068,10 +1097,10 @@ class QueueServer implements CrawlConstants
      * This function is used to schedule slots for crawl-delayed hosts.
      *
      * @param int $index location to begin searching for an empty slot
-     * @param array $arr list of slots to look in
+     * @param array &$arr list of slots to look in
      * @return int index of first available slot
      */
-    function getEarliestSlot($index, $arr)
+    function getEarliestSlot($index, &$arr)
     {
         $cnt = count($arr);
@@ -1130,6 +1159,7 @@ class QueueServer implements CrawlConstants
     function urlMemberSiteArray($url, $site_array)
     {
         $flag = false;
+        if(!is_array($site_array)) {return false;}
         foreach($site_array as $site) {
             $site_parts = mb_split("domain:", $site);
             if(isset($site_parts[1]) &&
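Review note: schedule.txt is now line-oriented rather than one big serialize(): a header line of base64-encoded serialized batch info, then one line per URL packing the weight as a float, the crawl delay as a 32-bit big-endian int, and the raw URL. A sketch of one record's round trip (standalone PHP; sample values hypothetical):

    <?php
    // Encode one schedule line as the queue server now writes it.
    function encodeScheduleLine($url, $weight, $delay)
    {
        return base64_encode(pack("f", $weight).pack("N", $delay).$url)."\n";
    }
    // Decode it the way the fetcher unpacks each strtok()'d line.
    function decodeScheduleLine($line)
    {
        $string = base64_decode(trim($line));
        $tmp = unpack("f", substr($string, 0, 4));
        $weight = $tmp[1];
        $tmp = unpack("N", substr($string, 4, 4));
        $delay = $tmp[1];
        $url = substr($string, 8);
        return array($url, $weight, $delay);
    }
    print_r(decodeScheduleLine(encodeScheduleLine("http://www.example.com/", 1.0, 0)));
    // Array ( [0] => http://www.example.com/ [1] => 1 [2] => 0 )

One caveat worth flagging: pack("N") is explicitly big-endian, but pack("f") uses machine byte order, so this format assumes the queue server and its fetchers share a float representation.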
diff --git a/configs/config.php b/configs/config.php
index 84b798c4f..2ba10d5e7 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -164,9 +164,6 @@
 define('NUM_WORDS_PER_GENERATION', 6*URL_FILTER_SIZE/NUM_INDEX_PARTITIONS);
 
 /** number of generations to sample in estimating number of urls in a query */
 define('SAMPLE_GENERATIONS', 3);
 
-/** store inlink data in word inverted index */
-define('STORE_INLINKS_IN_DICTIONARY', false);
-
 /** precision to round floating points document scores */
 define('PRECISION', 10);
diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php
index 4b58245d3..f556e9c3f 100755
--- a/controllers/admin_controller.php
+++ b/controllers/admin_controller.php
@@ -707,13 +707,13 @@ class AdminController extends Controller implements CrawlConstants
                     CRAWL_DIR."/schedules/queue_server_messages.txt",
                     $info_string);
 
-                $scheduler_info[self::SEEN_URLS] = array();
+                $scheduler_info[self::HASH_SEEN_URLS] = array();
 
                 foreach ($seed_info['seed_sites']['url'] as $site) {
                     $scheduler_info[self::TO_CRAWL][] = array($site, 1.0);
                 }
-                $scheduler_info[self::ROBOT_TXT] = array();
-                $scheduler_string = serialize($scheduler_info);
+                $scheduler_string = "\n".urlencode(base64_encode(
+                    gzcompress(serialize($scheduler_info))));
                 @unlink(CRAWL_DIR."/schedules/schedule.txt");
                 file_put_contents(
                     CRAWL_DIR."/schedules/ScheduleDataStartCrawl.txt",
@@ -737,11 +737,19 @@ class AdminController extends Controller implements CrawlConstants
             case "resume":
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >".
                     tl('admin_controller_resume_crawl')."</h1>')";
-
+                $seed_info = $this->crawlModel->getSeedInfo();
                 $info = array();
                 $info[self::STATUS] = "RESUME_CRAWL";
                 $info[self::CRAWL_TIME] =
                     $this->clean($_REQUEST['timestamp'], "int");
+                $info[self::CRAWL_ORDER] =
+                    $seed_info['general']['crawl_order'];
+                $info[self::RESTRICT_SITES_BY_URL] =
+                    $seed_info['general']['restrict_sites_by_url'];
+                $info[self::ALLOWED_SITES] =
+                    $seed_info['allowed_sites']['url'];
+                $info[self::DISALLOWED_SITES] =
+                    $seed_info['disallowed_sites']['url'];
                 $info_string = serialize($info);
                 file_put_contents(
                     CRAWL_DIR."/schedules/queue_server_messages.txt",
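Review note: the leading "\n" on $scheduler_string looks deliberate: schedule files now begin with a machine-info line, and a schedule seeded from the admin panel has no machine info, so its first line is left empty. The queue server's fgets()-based reader then works unchanged; roughly (a sketch; the path literal stands in for Yioop's CRAWL_DIR-based one):

    <?php
    // How the queue server splits any schedule file, including the
    // admin-seeded one whose machine-info line is just "\n".
    $file = "/tmp/schedules/ScheduleDataStartCrawl.txt"; // hypothetical path
    $fh = fopen($file, "rb");
    $machine_string = fgets($fh); // empty line for admin-seeded schedules
    $len = strlen($machine_string);
    $sites = unserialize(gzuncompress(base64_decode(
        urldecode(fread($fh, filesize($file) - $len)))));
    fclose($fh);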
diff --git a/controllers/fetch_controller.php b/controllers/fetch_controller.php
index 04adb8cd3..9aef34649 100755
--- a/controllers/fetch_controller.php
+++ b/controllers/fetch_controller.php
@@ -103,7 +103,7 @@ class FetchController extends Controller implements CrawlConstants
         } else {
             $info = array();
             $info[self::STATUS] = self::NO_DATA_STATE;
-            $data['MESSAGE'] = serialize($info);
+            $data['MESSAGE'] = base64_encode(serialize($info))."\n";
         }
 
         $this->displayView($view, $data);
@@ -116,22 +116,30 @@ class FetchController extends Controller implements CrawlConstants
     function update()
     {
         $view = "fetch";
-
-        if(isset($_REQUEST['found'])) {
-            $info =array();
-            $sites = unserialize(gzuncompress(
-                base64_decode(urldecode($_REQUEST['found']))));
-
-            $address = str_replace(".", "-", $_SERVER['REMOTE_ADDR']);
-            $address = str_replace(":", "_", $address);
-            $time = time();
-            $day = floor($time/86400);
-
-
-            $this->addRobotSchedules($sites, $address, $day, $time);
-            $this->addToCrawlSchedules($sites, $address, $day, $time);
-            $this->addToIndexSchedules($sites, $address, $day, $time);
+        $address = str_replace(".", "-", $_SERVER['REMOTE_ADDR']);
+        $address = str_replace(":", "_", $address);
+        $time = time();
+        $day = floor($time/86400);
+
+        $info_flag = false;
+        if(isset($_REQUEST['robot_data'])) {
+            $this->addScheduleToScheduleDirectory(self::robot_data_base_name,
+                $_REQUEST['robot_data']);
+            $info_flag = true;
+        }
+        if(isset($_REQUEST['schedule_data'])) {
+            $this->addScheduleToScheduleDirectory(self::schedule_data_base_name,
+                $_REQUEST['schedule_data']);
+            $info_flag = true;
+        }
+        if(isset($_REQUEST['index_data'])) {
+            $this->addScheduleToScheduleDirectory(self::index_data_base_name,
+                $_REQUEST['index_data']);
+            $info_flag = true;
+        }
+        if($info_flag == true) {
+            $info = array();
             $info[self::STATUS] = self::CONTINUE_STATE;
             if(file_exists(CRAWL_DIR."/schedules/crawl_status.txt")) {
                 $crawl_status = unserialize(
@@ -139,8 +147,9 @@ class FetchController extends Controller implements CrawlConstants
                 $info[self::CRAWL_TIME] = $crawl_status['CRAWL_TIME'];
             } else {
                 $info[self::CRAWL_TIME] = 0;
-            }
+            }
 
+            $info[self::MEMORY_USAGE] = memory_get_peak_usage();
             $data = array();
             $data['MESSAGE'] = serialize($info);
@@ -148,125 +157,23 @@ class FetchController extends Controller implements CrawlConstants
         }
     }
 
-    /**
-     * Adds a file containing the seen sites and inverted index from the
-     * just received $sites array to the schedules folder's index directory's
-     * subfolder for the current crawl time. This file is added in a sub folder
-     * $day and its name contains the $time at which it arrived and the ip
-     * $address from which it arrived. This file will then be process later
-     * by the queue server.
-     *
-     * @param &array $sites a list of seen sites and an inverted inverted index
-     * @param string $address the IP address of the sending machine with . -->_
-     * @param string $day timestamp in seconds converted to days
-     * @param string $time timestamp in seconds
-     */
-    function addToIndexSchedules(&$sites, $address, $day, $time)
-    {
-        if(isset($sites[self::SEEN_URLS])) {
-            $index_sites[self::SEEN_URLS] = $sites[self::SEEN_URLS];
-        }
-        $sites[self::SEEN_URLS] = NULL;
-
-        $index_sites[self::MACHINE] = $_SERVER['REMOTE_ADDR'];
-        $index_sites[self::MACHINE_URI] = $_REQUEST['machine_uri'];
-        if(isset($sites[self::INVERTED_INDEX])) {
-            $index_sites[self::INVERTED_INDEX] = $sites[self::INVERTED_INDEX];
-        }
-        $index_dir =
-            CRAWL_DIR."/schedules/".self::index_data_base_name.
-            $_REQUEST['crawl_time'];
-
-        $this->addScheduleToScheduleDirectory(
-            $index_dir, $index_sites, $address, $day, $time);
-        $sites[self::INVERTED_INDEX] = NULL;
-    }
-
-    /**
-     * Adds a file containing the to-crawl sites from the just received
-     * $sites array to the schedules folder's schedule data directory's
-     * subfolder for the current crawl time. This file is added in a sub folder
-     * $day and its name contains the $time at which it arrived and the ip
-     * $address from which it arrived. This file will then be process later
-     * by the queue server. In addition to to-crawl sites the seen urls
-     * in $sites are also save in the file. They are used to perform a sanity
-     * check on the priority queue by the queue server.
-     *
-     * @param &array $sites a list of seen sites and to crawl sites
-     * @param string $address the IP address of the sending machine with . -->_
-     * @param string $day timestamp in seconds converted to days
-     * @param string $time timestamp in seconds
-     */
-    function addToCrawlSchedules(&$sites, $address, $day, $time)
-    {
-        $base_dir = CRAWL_DIR."/schedules/".
-            self::schedule_data_base_name.$_REQUEST['crawl_time'];
-        $scheduler_info = array();
-
-        if(isset($sites[self::TO_CRAWL])) {
-            $scheduler_info[self::TO_CRAWL] = $sites[self::TO_CRAWL];
-        }
-
-        $scheduler_info[self::MACHINE] = $_SERVER['REMOTE_ADDR'];
-
-        if(isset($sites[self::SCHEDULE_TIME])) {
-            $scheduler_info[self::SCHEDULE_TIME] = $sites[self::SCHEDULE_TIME];
-        }
-
-        if(isset($sites[self::SEEN_URLS])) {
-            $seen_sites = $sites[self::SEEN_URLS];
-            $num_seen = count($seen_sites);
-
-            for($i = 0; $i < $num_seen; $i++) {
-                $scheduler_info[self::SEEN_URLS][$i] =
-                    $seen_sites[$i][self::URL];
-            }
-        }
-        $this->addScheduleToScheduleDirectory(
-            $base_dir, $scheduler_info, $address, $day, $time);
-        $sites[self::TO_CRAWL] = NULL;
-    }
-
-    /**
-     * Adds a file containing the robot site data from the just received
-     * $sites array to the schedules folder's robot data directory's
-     * subfolder for the current crawl time. This file is added in a sub folder
-     * $day and its name contains the $time at which it arrived and the ip
-     * $address from which it arrived. This file will then be process later
-     * by the queue server.
-     *
-     * @param &array $sites a list of seen sites and an inverted inverted index
-     * @param string $address the IP address of the sending machine with . -->_
-     * @param string $day timestamp in seconds converted to days
-     * @param string $time timestamp in seconds
-     */
-    function addRobotSchedules(&$sites, $address, $day, $time)
-    {
-        $robot_dir = CRAWL_DIR."/schedules/".
-            self::robot_data_base_name.$_REQUEST['crawl_time'];
-        if(isset($sites[self::ROBOT_TXT])) {
-            $data = $sites[self::ROBOT_TXT];
-        } else {
-            $data = array();
-        }
-        $this->addScheduleToScheduleDirectory(
-            $robot_dir, $data, $address, $day, $time);
-        $sites[self::ROBOT_TXT] = NULL;
-    }
-
     /**
      * Adds a file with contents $data and with name containing $address and
      * $time to a subfolder $day of a folder $dir
      *
-     * @param string $dir directory in which to add the schedule file
-     * @param &array $data data that the schedule file is to contain
-     * @param string $address the IP address of the sending machine with . -->_
-     * @param string $day timestamp in seconds converted to days
-     * @param string $time timestamp in seconds
+     * @param string $schedule_name the name of the kind of schedule being
+     *     saved
+     * @param string &$data_string encoded, compressed, serialized data the
+     *     schedule is to contain
      */
-    function addScheduleToScheduleDirectory($dir, &$data, $address, $day, $time)
+    function addScheduleToScheduleDirectory($schedule_name, &$data_string)
     {
+        $dir = CRAWL_DIR."/schedules/".$schedule_name.$_REQUEST['crawl_time'];
+
+        $address = str_replace(".", "-", $_SERVER['REMOTE_ADDR']);
+        $address = str_replace(":", "_", $address);
+        $time = time();
+        $day = floor($time/86400);
+
         if(!file_exists($dir)) {
             mkdir($dir);
             chmod($dir, 0777);
@@ -277,12 +184,18 @@ class FetchController extends Controller implements CrawlConstants
             mkdir($dir);
             chmod($dir, 0777);
         }
-
-        $data_string = serialize($data);
+        $machine_data = array();
+        $machine_data[self::MACHINE] = $_SERVER['REMOTE_ADDR'];
+        $machine_data[self::MACHINE_URI] = $_REQUEST['machine_uri'];
+        $machine_string = base64_encode(serialize($machine_data))."\n";
+
         $data_hash = crawlHash($data_string);
-        file_put_contents(
-            $dir."/At".$time."From".$address.
-            "WithHash$data_hash.txt", $data_string);
+        $fh = fopen($dir."/At".$time."From".$address.
+            "WithHash$data_hash.txt", "wb");
+        fwrite($fh, $machine_string);
+        fwrite($fh, $data_string);
+        fclose($fh);
+
     }
 
     /**
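Review note: addScheduleToScheduleDirectory() now fixes a single two-part on-disk layout for every schedule kind: line one is base64-encoded serialized machine data, and the rest of the file is the payload exactly as received (still urlencoded). A sketch of the writer side (standalone PHP; path, address, and hash are hypothetical stand-ins for the real values):

    <?php
    // Write a schedule file in the new two-part layout.
    $machine_data = array('MACHINE' => '127.0.0.1', 'MACHINE_URI' => '/');
    $machine_string = base64_encode(serialize($machine_data))."\n";
    $data_string = "still-urlencoded-payload"; // as received in $_REQUEST
    $name = "/tmp/At".time()."From127-0-0-1WithHashXYZ.txt";
    $fh = fopen($name, "wb");
    fwrite($fh, $machine_string); // line 1: machine info
    fwrite($fh, $data_string);    // remainder: compressed payload
    fclose($fh);

Because the machine line is newline-terminated base64 and the payload is urlencoded (a raw newline would appear as %0A), neither part can contain a stray "\n", which is what makes the fgets()/fread() split on the queue-server side safe.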
diff --git a/index.php b/index.php
index bf0120a1d..c94b06887 100755
--- a/index.php
+++ b/index.php
@@ -42,7 +42,7 @@
 define("BASE_DIR", substr($_SERVER['SCRIPT_FILENAME'], 0,-strlen("index.php")));
 /**
  * Load the configuration file
  */
 require_once(BASE_DIR.'configs/config.php');
-ini_set("memory_limit","450M");
+ini_set("memory_limit","500M");
 header("X-FRAME-OPTIONS: DENY"); //prevent click jacking
 session_name(SESSION_NAME);
 session_start();
diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 0a2dcf79e..12a98b732 100644
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -128,5 +128,8 @@ interface CrawlConstants
     const FILETYPE = 'ag';
     const SUMMARY = 'ah';
     const URL_INFO = 'ai';
+    const HASH_SEEN_URLS = 'aj';
+    const RECENT_URLS = 'ak';
+    const MEMORY_USAGE = 'al';
 }
 ?>
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index f048e4128..72f90010c 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -429,7 +429,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
             $slice_cnt--;
         }
         if($min_common !== NULL) {
-            $out_data[0][$word_key .":". $min_common][self::POINT_BLOCK] = 0;
+            $out_data[
+                0][$word_key .":". $min_common][self::POINT_BLOCK] = 0;
                 // this index needs to point to previous block with word
         }
diff --git a/lib/persistent_structure.php b/lib/persistent_structure.php
index 40594e750..a471a0d4a 100755
--- a/lib/persistent_structure.php
+++ b/lib/persistent_structure.php
@@ -48,7 +48,8 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 class PersistentStructure
 {
-    /** If not specified in the constructor, this will be the number of operations between saves
+    /** If not specified in the constructor, this will be the number of
+     *  operations between saves
      * @var int
      */
     const DEFAULT_SAVE_FREQUENCY = 50000;
@@ -72,7 +73,7 @@ class PersistentStructure
      * @param string $fname the name of the file to store the
      *     PersistentStructure in
      * @param int $save_frequency the number of operation before a save If
-     *     <= 0 never save
+     *     <= 0, a save is never automatically triggered
      */
     public function __construct($fname,
         $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
@@ -94,7 +95,9 @@ class PersistentStructure
     }
 
     /**
-     * Save the PersistentStructure to its filename
+     * Save the PersistentStructure to its filename
+     * This method is generic but quite memory inefficient, so subclasses
+     * may need to reimplement it
      */
     public function save()
     {
diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php
index 967859823..22d2d9008 100755
--- a/lib/phrase_parser.php
+++ b/lib/phrase_parser.php
@@ -63,9 +63,10 @@ class PhraseParser
      */
     static function extractWordStringPageSummary($page)
     {
-        $title_phrase_string = mb_ereg_replace("[[:punct:]]", " ",
+        $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\|";
+        $title_phrase_string = mb_ereg_replace($punct, " ",
             $page[CrawlConstants::TITLE]);
-        $description_phrase_string = mb_ereg_replace("[[:punct:]]", " ",
+        $description_phrase_string = mb_ereg_replace($punct, " ",
             $page[CrawlConstants::DESCRIPTION]);
         $page_string = $title_phrase_string . " " . $description_phrase_string;
@@ -132,7 +133,8 @@ class PhraseParser
     static function extractPhrasesOfLengthOffset($string,
         $phrase_len, $offset)
     {
-        $words = mb_split("[[:space:]]", $string);
+        $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\|";
+        $words = mb_split("[[:space:]]|".$punct, $string);
 
         $stems = array();
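Review note: the same explicit $punct alternation now appears here and in PhraseModel, replacing the broader POSIX class [[:punct:]], presumably so that exactly this fixed set of ASCII characters (and nothing else) splits words. A quick check of the new splitting behavior (standalone PHP; sample text hypothetical):

    <?php
    // Split on whitespace or any of the escaped punctuation characters.
    $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\|";
    $words = mb_split("[[:space:]]|".$punct, "Hello, world: a (test)!");
    // mb_split leaves empty strings where delimiters touch; drop them.
    print_r(array_values(array_filter($words, 'strlen')));
    // Array ( [0] => Hello [1] => world [2] => a [3] => test )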
diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php
index 8e768c705..a3fae469c 100755
--- a/lib/processors/text_processor.php
+++ b/lib/processors/text_processor.php
@@ -119,9 +119,12 @@ class TextProcessor implements CrawlConstants
             '@((http|https)://([^ \t\r\n\v\f\'\"\;\,\<\>\[\]\{\}\(\)])*)@i';
         $sites = array();
         preg_match_all($pattern, $page, $matches);
+        $i = 0;
         foreach($matches[0] as $url) {
             if(!isset($sites[$url])) {
                 $sites[$url] = strip_tags($url);
+                $i++;
+                if($i >= MAX_LINKS_PER_PAGE) {break;}
             }
         }
         return $sites;
diff --git a/lib/string_array.php b/lib/string_array.php
index 3907eb805..831ea4dd8 100755
--- a/lib/string_array.php
+++ b/lib/string_array.php
@@ -65,7 +65,7 @@ class StringArray extends PersistentStructure
      * Number of bytes of storage need by the string array
      * @var int
      */
-    var $array_size;
+    var $string_array_size;
     /**
      * Character string used to store the packed data of the StringArray
      * @var string
@@ -98,6 +98,39 @@ class StringArray extends PersistentStructure
 
     }
 
+    /**
+     * Load a StringArray from a file
+     *
+     * @param string $fname the name of the file to load the StringArray from
+     * @return object the PersistentStructure loaded
+     */
+    public static function load($fname)
+    {
+        $fh = fopen($fname, "rb");
+        $tmp = unpack("N", fread($fh, 4));
+        $array_size = $tmp[1];
+        $array = fread($fh, $array_size);
+        $object = unserialize(fread($fh,
+            filesize($fname) - 4 - $array_size));
+        $object->string_array = & $array;
+        fclose($fh);
+        return $object;
+    }
+
+    /**
+     * Save the StringArray to its filename
+     */
+    public function save()
+    {
+        $fh = fopen($this->filename, "wb");
+        $tmp = & $this->string_array;
+        fwrite($fh, pack("N", $this->string_array_size));
+        fwrite($fh, $this->string_array);
+        unset($this->string_array);
+        fwrite($fh, serialize($this));
+        $this->string_array = & $tmp;
+        fclose($fh);
+    }
 
     /**
      * Looks up the ith item in the StringArray
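Review note: StringArray now bypasses the generic PersistentStructure::save() so the potentially very large packed string is never duplicated inside serialize(). The file layout is: a 4-byte big-endian length, the raw string bytes, then the serialized object with string_array temporarily unset. A sketch of just that framing (standalone PHP; the 16-byte payload is a stand-in):

    <?php
    // Frame: [4-byte big-endian length][raw bytes][serialized remainder]
    $raw = str_repeat("\x00", 16); // stand-in for the packed array data
    $rest = serialize(array('string_array_size' => 16));
    $frame = pack("N", strlen($raw)).$raw.$rest;

    // Read it back, mirroring StringArray::load().
    $tmp = unpack("N", substr($frame, 0, 4));
    $size = $tmp[1];
    $data = substr($frame, 4, $size);
    $object = unserialize(substr($frame, 4 + $size));
    var_dump($size === strlen($data)); // bool(true)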
diff --git a/lib/web_queue_bundle.php b/lib/web_queue_bundle.php
index efb06bd13..5b257940b 100755
--- a/lib/web_queue_bundle.php
+++ b/lib/web_queue_bundle.php
@@ -348,11 +348,16 @@ class WebQueueBundle implements Notifier
      * url is scheduled to be crawled. It only deletes the item from
      * the bundles priority queue and hash table -- not from the web archive.
      *
-     * @param string $url the url to delete
+     * @param string $url the url or hash of url to delete
+     * @param bool $isHash flag to say whether or not $url is the hash of a url
      */
-    function removeQueue($url)
+    function removeQueue($url, $isHash = false)
     {
-        $hash_url = crawlHash($url, true);
+        if($isHash == true) {
+            $hash_url = $url;
+        } else {
+            $hash_url = crawlHash($url, true);
+        }
         $data = $this->lookupHashTable($hash_url);
 
         if(!$data) {
@@ -653,7 +658,7 @@ class WebQueueBundle implements Notifier
      * The problem is as the table gets reused a lot, it tends to fill up
      * with a lot of deleted entries making lookup times get more and more
      * linear in the hash table size. By rebuilding the table we mitigate
-     * against this problem. By choosing the rebuild frequecy appropriately,
+     * against this problem. By choosing the rebuild frequency appropriately,
      * the amortized cost of this operation is only O(1).
      */
     function rebuildHashTable()
diff --git a/models/phrase_model.php b/models/phrase_model.php
index 84b732abb..e6d2d6db0 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -198,8 +198,8 @@ class PhraseModel extends Model
         $index_archive_name = self::index_data_base_name . $index_name;
         $index_archive = new IndexArchiveBundle(
             CRAWL_DIR.'/cache/'.$index_archive_name);
-
-        $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $phrase_string);
+        $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\|";
+        $phrase_string = mb_ereg_replace($punct, " ", $phrase_string);
         $phrase_string = preg_replace("/(\s)+/", " ", $phrase_string);
         /*