diff --git a/bin/fetcher.php b/bin/fetcher.php
index e1bc72a2d..a29b187c5 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -36,7 +36,7 @@
 define("BASE_DIR", substr(
     dirname(realpath($_SERVER['PHP_SELF'])), 0,
     -strlen("/bin")));
 
-ini_set("memory_limit","600M"); //so have enough memory to crawl big pages
+ini_set("memory_limit","700M"); //so we have enough memory to crawl big pages
 
/** Load in global configuration settings */
 require_once BASE_DIR.'/configs/config.php';
@@ -333,6 +333,7 @@ class Fetcher implements CrawlConstants
         $this->found_duplicates = array_merge($this->found_duplicates,
             $duplicates);
         if($can_schedule_again == true) {
+            //only schedule failed sites without a crawl-delay to crawl again
             foreach($schedule_again_pages as $schedule_again_page) {
                 if($schedule_again_page[self::CRAWL_DELAY] == 0) {
                     $this->to_crawl_again[] =
@@ -437,15 +438,27 @@ class Fetcher implements CrawlConstants
         $request =
             $queue_server."?c=fetch&a=schedule&time=$time&session=$session";
 
-        $info_string = FetchUrl::getPage($request);
-        $info = unserialize(trim($info_string));
+        $info_string = trim(FetchUrl::getPage($request));
+        $tok = strtok($info_string, "\n");
+        $info = unserialize(base64_decode($tok));
 
         if(isset($info[self::CRAWL_ORDER])) {
             $this->crawl_order = $info[self::CRAWL_ORDER];
         }
 
         if(isset($info[self::SITES])) {
-            $this->to_crawl = $info[self::SITES];
+            $this->to_crawl = array();
+            while($tok !== false) {
+                $string = base64_decode($tok);
+                $tmp = unpack("f", substr($string, 0 , 4));
+                $weight = $tmp[1];
+                $tmp = unpack("N", substr($string, 4 , 4));
+                $delay = $tmp[1];
+                $url = substr($string, 8);
+                $this->to_crawl[] = array($url, $weight, $delay);
+
+                $tok = strtok("\n");
+            }
         }
 
         if(isset($info[self::SCHEDULE_TIME])) {
@@ -645,8 +658,7 @@ class Fetcher implements CrawlConstants
                 $site[self::URL]);
 
             if($doc_info) {
-
-                $site[self::DOC_INFO] = $doc_info;
+                $site[self::DOC_INFO] = $doc_info;
 
                 if(!is_dir(CRAWL_DIR."/cache")) {
                     mkdir(CRAWL_DIR."/cache");
@@ -888,26 +900,95 @@ class Fetcher implements CrawlConstants
 
         if(count($this->to_crawl) <= 0) {
-            $this->found_sites[self::SCHEDULE_TIME] = $this->schedule_time;
+            $schedule_time = $this->schedule_time;
         }
+        /*
+            In what follows, as we generate the post data we delete items
+            from $this->found_sites, to try to minimize our memory
+            footprint.
+         */
+        $bytes_to_send = 0;
+        $post_data = array('c'=>'fetch', 'a'=>'update',
+            'crawl_time' => $this->crawl_time, 'machine_uri' => WEB_URI);
+
+        //handle robots.txt data
+        if(isset($this->found_sites[self::ROBOT_TXT])) {
+            $post_data['robot_data'] = urlencode(base64_encode(
+                gzcompress(serialize($this->found_sites[self::ROBOT_TXT]))));
+            unset($this->found_sites[self::ROBOT_TXT]);
+            $bytes_to_send += strlen($post_data['robot_data']);
+        }
+
+        //handle schedule data
+        $schedule_data = array();
+        if(isset($this->found_sites[self::TO_CRAWL])) {
+            $schedule_data[self::TO_CRAWL] = &
+                $this->found_sites[self::TO_CRAWL];
+        }
+        unset($this->found_sites[self::TO_CRAWL]);
+
         if(isset($this->found_sites[self::SEEN_URLS]) &&
             count($this->found_sites[self::SEEN_URLS]) > 0 ) {
-            $this->buildMiniInvertedIndex();
+            $hash_seen_urls = array();
+            $recent_urls = array();
+            $cnt = 0;
+            foreach($this->found_sites[self::SEEN_URLS] as $site) {
+                $hash_seen_urls[] =
+                    crawlHash($site[self::URL], true);
+                if(strpos($site[self::URL], "url|") !== 0) {
+                    array_push($recent_urls, $site[self::URL]);
+                    if($cnt >= NUM_RECENT_URLS_TO_DISPLAY)
+                    {
+                        array_shift($recent_urls);
+                    }
+                    $cnt++;
+                }
+            }
+            $schedule_data[self::HASH_SEEN_URLS] = & $hash_seen_urls;
+            unset($hash_seen_urls);
+            $schedule_data[self::RECENT_URLS] = & $recent_urls;
+            unset($recent_urls);
         }
+        if(!empty($schedule_data)) {
+            if(isset($schedule_time)) {
+                $schedule_data[self::SCHEDULE_TIME] = $schedule_time;
+            }
+            $post_data['schedule_data'] = urlencode(base64_encode(
+                gzcompress(serialize($schedule_data))));
+            $bytes_to_send += strlen($post_data['schedule_data']);
+        }
+        unset($schedule_data);
 
-        $post_data = array('c'=>'fetch', 'a'=>'update',
-            'crawl_time' => $this->crawl_time, 'machine_uri' => WEB_URI);
-
-        $post_data['found'] = urlencode(base64_encode(
-            gzcompress(serialize($this->found_sites))));
-        $bytes_to_send = strlen($post_data['found']);
+        //handle mini inverted index
+        if(isset($this->found_sites[self::SEEN_URLS]) &&
+            count($this->found_sites[self::SEEN_URLS]) > 0 ) {
+            $this->buildMiniInvertedIndex();
+        }
+        if(isset($this->found_sites[self::INVERTED_INDEX])) {
+            $index_data = array();
+            $index_data[self::SEEN_URLS] = &
+                $this->found_sites[self::SEEN_URLS];
+            unset($this->found_sites[self::SEEN_URLS]);
+            $index_data[self::INVERTED_INDEX] = &
+                $this->found_sites[self::INVERTED_INDEX];
+            unset($this->found_sites[self::INVERTED_INDEX]);
+            $post_data['index_data'] = urlencode(base64_encode(
+                gzcompress(serialize($index_data))));
+            unset($index_data);
+            $bytes_to_send += strlen($post_data['index_data']);
+        }
 
         $this->found_sites = array(); // reset found_sites so have more space.
+        if($bytes_to_send <= 0) {
+            crawlLog("No data to send, aborting update scheduler...");
+            return;
+        }
+        //try to send to queue server
         $sleep = false;
         do {
-
+            if($sleep == true) {
                 crawlLog("Trouble sending to the scheduler\n $info_string...");
                 sleep(5);
@@ -927,7 +1008,10 @@ class Fetcher implements CrawlConstants
             $info = unserialize(trim($info_string));
             crawlLog("Queue Server info response code: ".$info[self::STATUS]);
             crawlLog("Queue Server's crawl time is: ".$info[self::CRAWL_TIME]);
-
+            crawlLog("Web Server peak memory usage: ".
+                $info[self::MEMORY_USAGE]);
+            crawlLog("This fetcher peak memory usage: ".
+                memory_get_peak_usage());
         } while(!isset($info[self::STATUS]) ||
             $info[self::STATUS] != self::CONTINUE_STATE);
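Review note: the single 'found' POST field is replaced by three independently decodable fields ('robot_data', 'schedule_data', 'index_data'), each encoded serialize -> gzcompress -> base64 -> urlencode. A minimal sketch of the round trip (standalone PHP; the sample array is hypothetical):

    <?php
    // Encode a payload the way the fetcher now builds each POST field.
    function encodePayload($data)
    {
        return urlencode(base64_encode(gzcompress(serialize($data))));
    }
    // Inverse, as applied when the payload is later read back from disk.
    function decodePayload($payload)
    {
        return unserialize(gzuncompress(base64_decode(urldecode($payload))));
    }
    $sample = array('to_crawl' => array(array('http://www.example.com/', 1.0)));
    $encoded = encodePayload($sample);
    var_dump(decodePayload($encoded) == $sample); // bool(true)

The explicit urlencode() appears deliberate: the value is double-encoded in transit, so after PHP automatically decodes the POST body once, the string the fetch controller writes to disk is still urlencoded, and the queue server's urldecode() restores the base64 intact.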
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 06882c111..f0e606f09 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -484,14 +484,21 @@ class QueueServer implements CrawlConstants
         $start_time = microtime();
 
         $index_archive = $this->index_archive;
 
-        $sites = unserialize(file_get_contents($file));
-
+        $fh = fopen($file, "rb");
+        $machine_string = fgets($fh);
+        $len = strlen($machine_string);
+        $machine_info = unserialize(base64_decode($machine_string));
+        $sites = unserialize(gzuncompress(base64_decode(
+            urldecode(fread($fh, filesize($file) - $len))
+            )));
+        fclose($fh);
+
         crawlLog("A memory usage".memory_get_usage() .
             " time: ".(changeInMicrotime($start_time)));
         $start_time = microtime();
 
-        $machine = $sites[self::MACHINE];
-        $machine_uri = $sites[self::MACHINE_URI];
+        $machine = $machine_info[self::MACHINE];
+        $machine_uri = $machine_info[self::MACHINE_URI];
 
         if(isset($sites[self::SEEN_URLS]) &&
             count($sites[self::SEEN_URLS]) > 0) {
@@ -606,9 +613,15 @@ class QueueServer implements CrawlConstants
         crawlLog("Processing Robots data in $file");
         $start_time = microtime();
 
-        $sites = unserialize(file_get_contents($file));
+        $fh = fopen($file, "rb");
+        $machine_string = fgets($fh);
+        $len = strlen($machine_string);
+        unset($machine_string);
+        $sites = unserialize(gzuncompress(base64_decode(
+            urldecode(fread($fh, filesize($file) - $len))
+            )));
+        fclose($fh);
-
         if(isset($sites)) {
             foreach($sites as $robot_host => $robot_info) {
                 $this->web_queue->addGotRobotTxtFilter($robot_host);
@@ -705,10 +718,20 @@ class QueueServer implements CrawlConstants
     {
         crawlLog("Processing File: $file");
 
-        $sites = unserialize(file_get_contents($file));
-
-        if(isset($sites[self::MACHINE])) {
-            $this->most_recent_fetcher = $sites[self::MACHINE];
+        $fh = fopen($file, "rb");
+        $machine_string = fgets($fh);
+        $len = strlen($machine_string);
+        if($len > 0) {
+            $machine_info = unserialize(base64_decode($machine_string));
+        }
+        $sites = unserialize(gzuncompress(base64_decode(
+            urldecode(fread($fh, filesize($file) - $len))
+            )));
+        fclose($fh);
+
+        if(isset($machine_info[self::MACHINE])) {
+            $this->most_recent_fetcher = & $machine_info[self::MACHINE];
+            unset($machine_info);
         }
 
         crawlLog("...Updating Delayed Hosts Array ...");
@@ -730,29 +753,19 @@ class QueueServer implements CrawlConstants
         $start_time = microtime();
 
         $most_recent_urls = array();
 
-        if(isset($sites[self::SEEN_URLS])) {
+        if(isset($sites[self::HASH_SEEN_URLS])) {
             $cnt = 0;
-            foreach($sites[self::SEEN_URLS] as $url) {
-                if($this->web_queue->containsUrlQueue($url)) {
-                    crawlLog(
-                        "Removing $url from Queue (shouldn't still be there!)");
-                    $this->web_queue->removeQueue($url);
+            foreach($sites[self::HASH_SEEN_URLS] as $hash_url) {
+                if($this->web_queue->lookupHashTable($hash_url)) {
+                    crawlLog("Removing hash ". base64_encode($hash_url).
+                        " from Queue");
+                    $this->web_queue->removeQueue($hash_url, true);
                 }
-                if(strpos($url, "url|") !== 0) {
-                    array_push($most_recent_urls, $url);
-                    if($cnt >= NUM_RECENT_URLS_TO_DISPLAY)
-                    {
-                        array_shift($most_recent_urls);
-                    }
-                    $cnt++;
-                }
-            }
             }
         }
         crawlLog(" time: ".(changeInMicrotime($start_time)));
-
         crawlLog("... To Crawl ...");
 
         $start_time = microtime();
         if(isset($sites[self::TO_CRAWL])) {
@@ -801,7 +814,7 @@ class QueueServer implements CrawlConstants
             }
         }
-
+        crawlLog(" time: ".(changeInMicrotime($start_time)));
 
         crawlLog("C..");
@@ -817,7 +830,9 @@ class QueueServer implements CrawlConstants
         $crawl_status = array();
         $crawl_status['MOST_RECENT_FETCHER'] = $this->most_recent_fetcher;
-        $crawl_status['MOST_RECENT_URLS_SEEN'] = $most_recent_urls;
+        if(isset($sites[self::RECENT_URLS])) {
+            $crawl_status['MOST_RECENT_URLS_SEEN'] = $sites[self::RECENT_URLS];
+        }
         $crawl_status['CRAWL_TIME'] = $this->crawl_time;
         $info_bundle = IndexArchiveBundle::getArchiveInfo(
             CRAWL_DIR.'/cache/'.self::index_data_base_name.$this->crawl_time);
@@ -835,9 +850,11 @@ class QueueServer implements CrawlConstants
         crawlLog("Number of unique pages so far: ".
             $info_bundle['VISITED_URLS_COUNT']);
         crawlLog("Total urls extracted so far: ".$info_bundle['COUNT']);
+        if(isset($sites[self::RECENT_URLS])) {
             crawlLog("Of these, the most recent urls are:");
-        foreach($most_recent_urls as $url) {
-            crawlLog("URL: $url");
+            foreach($sites[self::RECENT_URLS] as $url) {
+                crawlLog("URL: $url");
+            }
         }
     }
@@ -876,12 +893,15 @@ class QueueServer implements CrawlConstants
         $count = $this->web_queue->to_crawl_queue->count;
 
+        $sites = array();
         $sites[self::CRAWL_TIME] = $this->crawl_time;
         $sites[self::SCHEDULE_TIME] = time();
         $sites[self::SAVED_CRAWL_TIMES] = $this->getCrawlTimes();
             // fetcher should delete any crawl time not listed here
         $sites[self::CRAWL_ORDER] = $this->crawl_order;
         $sites[self::SITES] = array();
+        $first_line = base64_encode(serialize($sites))."\n";
+
         $delete_urls = array();
         $crawl_delay_hosts = array();
@@ -958,7 +978,7 @@ class QueueServer implements CrawlConstants
             $delay = $this->web_queue->getCrawlDelay($host_url);
 
             $num_waiting = count($this->waiting_hosts);
-
+
             if($delay > 0 ) {
                 // handle adding a url if there is a crawl delay
                 if((!isset($this->waiting_hosts[crawlHash($host_url)])
@@ -1047,8 +1067,17 @@ class QueueServer implements CrawlConstants
         }
 
         ksort($sites[self::SITES]);
-        file_put_contents(CRAWL_DIR."/schedules/schedule.txt",
-            serialize($sites));
+        //write schedule to disk
+        $fh = fopen(CRAWL_DIR."/schedules/schedule.txt", "wb");
+        fwrite($fh, $first_line);
+        foreach($sites[self::SITES] as $site) {
+            list($url, $weight, $delay) = $site;
+            $out_string = base64_encode(
+                pack("f", $weight).pack("N", $delay).$url)."\n";
+            fwrite($fh, $out_string);
+        }
+        fclose($fh);
+        crawlLog("End Produce Fetch Memory usage".memory_get_usage() );
         crawlLog("Created fetch batch... Queue size is now ".
             $this->web_queue->to_crawl_queue->count.
@@ -1068,10 +1097,10 @@ class QueueServer implements CrawlConstants
      * This function is used to schedule slots for crawl-delayed hosts.
      *
      * @param int $index location to begin searching for an empty slot
-     * @param array $arr list of slots to look in
+     * @param array &$arr list of slots to look in
      * @return int index of first available slot
      */
-    function getEarliestSlot($index, $arr)
+    function getEarliestSlot($index, &$arr)
     {
         $cnt = count($arr);
@@ -1130,6 +1159,7 @@ class QueueServer implements CrawlConstants
     function urlMemberSiteArray($url, $site_array)
     {
         $flag = false;
+        if(!is_array($site_array)) {return false;}
         foreach($site_array as $site) {
             $site_parts = mb_split("domain:", $site);
             if(isset($site_parts[1]) &&
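Review note: schedule.txt is now line-oriented rather than one big serialize(): a header line of base64-encoded serialized batch info, then one line per URL packing the weight as a float, the crawl delay as a 32-bit big-endian int, and the raw URL. A sketch of one record's round trip (standalone PHP; sample values hypothetical):

    <?php
    // Encode one schedule line as the queue server now writes it.
    function encodeScheduleLine($url, $weight, $delay)
    {
        return base64_encode(pack("f", $weight).pack("N", $delay).$url)."\n";
    }
    // Decode it the way the fetcher unpacks each strtok()'d line.
    function decodeScheduleLine($line)
    {
        $string = base64_decode(trim($line));
        $tmp = unpack("f", substr($string, 0, 4));
        $weight = $tmp[1];
        $tmp = unpack("N", substr($string, 4, 4));
        $delay = $tmp[1];
        $url = substr($string, 8);
        return array($url, $weight, $delay);
    }
    print_r(decodeScheduleLine(encodeScheduleLine("http://www.example.com/", 1.0, 0)));
    // Array ( [0] => http://www.example.com/ [1] => 1 [2] => 0 )

One caveat worth flagging: pack("N") is explicitly big-endian, but pack("f") uses machine byte order, so this format assumes the queue server and its fetchers share a float representation.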
diff --git a/configs/config.php b/configs/config.php
index 84b798c4f..2ba10d5e7 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -164,9 +164,6 @@
 define('NUM_WORDS_PER_GENERATION', 6*URL_FILTER_SIZE/NUM_INDEX_PARTITIONS);
 
 /** number of generations to sample in estimating number of urls in a query */
 define('SAMPLE_GENERATIONS', 3);
 
-/** store inlink data in word inverted index */
-define('STORE_INLINKS_IN_DICTIONARY', false);
-
 /** precision to round floating points document scores */
 define('PRECISION', 10);
diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php
index 4b58245d3..f556e9c3f 100755
--- a/controllers/admin_controller.php
+++ b/controllers/admin_controller.php
@@ -707,13 +707,13 @@ class AdminController extends Controller implements CrawlConstants
                     CRAWL_DIR."/schedules/queue_server_messages.txt",
                     $info_string);
 
-                $scheduler_info[self::SEEN_URLS] = array();
+                $scheduler_info[self::HASH_SEEN_URLS] = array();
 
                 foreach ($seed_info['seed_sites']['url'] as $site) {
                     $scheduler_info[self::TO_CRAWL][] = array($site, 1.0);
                 }
-                $scheduler_info[self::ROBOT_TXT] = array();
-                $scheduler_string = serialize($scheduler_info);
+                $scheduler_string = "\n".urlencode(base64_encode(
+                    gzcompress(serialize($scheduler_info))));
                 @unlink(CRAWL_DIR."/schedules/schedule.txt");
                 file_put_contents(
                     CRAWL_DIR."/schedules/ScheduleDataStartCrawl.txt",
@@ -737,11 +737,19 @@ class AdminController extends Controller implements CrawlConstants
             case "resume":
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >".
                     tl('admin_controller_resume_crawl')."</h1>')";
-
+                $seed_info = $this->crawlModel->getSeedInfo();
                 $info = array();
                 $info[self::STATUS] = "RESUME_CRAWL";
                 $info[self::CRAWL_TIME] =
                     $this->clean($_REQUEST['timestamp'], "int");
+                $info[self::CRAWL_ORDER] =
+                    $seed_info['general']['crawl_order'];
+                $info[self::RESTRICT_SITES_BY_URL] =
+                    $seed_info['general']['restrict_sites_by_url'];
+                $info[self::ALLOWED_SITES] =
+                    $seed_info['allowed_sites']['url'];
+                $info[self::DISALLOWED_SITES] =
+                    $seed_info['disallowed_sites']['url'];
                 $info_string = serialize($info);
                 file_put_contents(
                     CRAWL_DIR."/schedules/queue_server_messages.txt",
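Review note: the leading "\n" on $scheduler_string looks deliberate: schedule files now begin with a machine-info line, and a schedule seeded from the admin panel has no machine info, so its first line is left empty. The queue server's fgets()-based reader then works unchanged; roughly (a sketch; the path literal stands in for Yioop's CRAWL_DIR-based one):

    <?php
    // How the queue server splits any schedule file, including the
    // admin-seeded one whose machine-info line is just "\n".
    $file = "/tmp/schedules/ScheduleDataStartCrawl.txt"; // hypothetical path
    $fh = fopen($file, "rb");
    $machine_string = fgets($fh); // empty line for admin-seeded schedules
    $len = strlen($machine_string);
    $sites = unserialize(gzuncompress(base64_decode(
        urldecode(fread($fh, filesize($file) - $len)))));
    fclose($fh);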
diff --git a/controllers/fetch_controller.php b/controllers/fetch_controller.php
index 04adb8cd3..9aef34649 100755
--- a/controllers/fetch_controller.php
+++ b/controllers/fetch_controller.php
@@ -103,7 +103,7 @@ class FetchController extends Controller implements CrawlConstants
         } else {
             $info = array();
             $info[self::STATUS] = self::NO_DATA_STATE;
-            $data['MESSAGE'] = serialize($info);
+            $data['MESSAGE'] = base64_encode(serialize($info))."\n";
         }
 
         $this->displayView($view, $data);
@@ -116,22 +116,30 @@ class FetchController extends Controller implements CrawlConstants
     function update()
     {
         $view = "fetch";
-
-        if(isset($_REQUEST['found'])) {
-            $info =array();
-            $sites = unserialize(gzuncompress(
-                base64_decode(urldecode($_REQUEST['found']))));
-
-            $address = str_replace(".", "-", $_SERVER['REMOTE_ADDR']);
-            $address = str_replace(":", "_", $address);
-            $time = time();
-            $day = floor($time/86400);
-
-
-            $this->addRobotSchedules($sites, $address, $day, $time);
-            $this->addToCrawlSchedules($sites, $address, $day, $time);
-            $this->addToIndexSchedules($sites, $address, $day, $time);
+        $address = str_replace(".", "-", $_SERVER['REMOTE_ADDR']);
+        $address = str_replace(":", "_", $address);
+        $time = time();
+        $day = floor($time/86400);
+
+        $info_flag = false;
+        if(isset($_REQUEST['robot_data'])) {
+            $this->addScheduleToScheduleDirectory(self::robot_data_base_name,
+                $_REQUEST['robot_data']);
+            $info_flag = true;
+        }
+        if(isset($_REQUEST['schedule_data'])) {
+            $this->addScheduleToScheduleDirectory(self::schedule_data_base_name,
+                $_REQUEST['schedule_data']);
+            $info_flag = true;
+        }
+        if(isset($_REQUEST['index_data'])) {
+            $this->addScheduleToScheduleDirectory(self::index_data_base_name,
+                $_REQUEST['index_data']);
+            $info_flag = true;
+        }
+        if($info_flag == true) {
+            $info = array();
             $info[self::STATUS] = self::CONTINUE_STATE;
             if(file_exists(CRAWL_DIR."/schedules/crawl_status.txt")) {
                 $crawl_status = unserialize(
@@ -139,8 +147,9 @@ class FetchController extends Controller implements CrawlConstants
                 $info[self::CRAWL_TIME] = $crawl_status['CRAWL_TIME'];
             } else {
                 $info[self::CRAWL_TIME] = 0;
-            }
+            }
 
+            $info[self::MEMORY_USAGE] = memory_get_peak_usage();
             $data = array();
             $data['MESSAGE'] = serialize($info);
@@ -148,125 +157,23 @@ class FetchController extends Controller implements CrawlConstants
         }
     }
 
-    /**
-     * Adds a file containing the seen sites and inverted index from the
-     * just received $sites array to the schedules folder's index directory's
-     * subfolder for the current crawl time. This file is added in a sub folder
-     * $day and its name contains the $time at which it arrived and the ip
-     * $address from which it arrived. This file will then be process later
-     * by the queue server.
-     *
-     * @param &array $sites a list of seen sites and an inverted inverted index
-     * @param string $address the IP address of the sending machine with . -->_
-     * @param string $day timestamp in seconds converted to days
-     * @param string $time timestamp in seconds
-     */
-    function addToIndexSchedules(&$sites, $address, $day, $time)
-    {
-        if(isset($sites[self::SEEN_URLS])) {
-            $index_sites[self::SEEN_URLS] = $sites[self::SEEN_URLS];
-        }
-        $sites[self::SEEN_URLS] = NULL;
-
-        $index_sites[self::MACHINE] = $_SERVER['REMOTE_ADDR'];
-        $index_sites[self::MACHINE_URI] = $_REQUEST['machine_uri'];
-        if(isset($sites[self::INVERTED_INDEX])) {
-            $index_sites[self::INVERTED_INDEX] = $sites[self::INVERTED_INDEX];
-        }
-        $index_dir =
-            CRAWL_DIR."/schedules/".self::index_data_base_name.
-            $_REQUEST['crawl_time'];
-
-        $this->addScheduleToScheduleDirectory(
-            $index_dir, $index_sites, $address, $day, $time);
-        $sites[self::INVERTED_INDEX] = NULL;
-    }
-
-    /**
-     * Adds a file containing the to-crawl sites from the just received
-     * $sites array to the schedules folder's schedule data directory's
-     * subfolder for the current crawl time. This file is added in a sub folder
-     * $day and its name contains the $time at which it arrived and the ip
-     * $address from which it arrived. This file will then be process later
-     * by the queue server. In addition to to-crawl sites the seen urls
-     * in $sites are also save in the file. They are used to perform a sanity
-     * check on the priority queue by the queue server.
-     *
-     * @param &array $sites a list of seen sites and to crawl sites
-     * @param string $address the IP address of the sending machine with . -->_
-     * @param string $day timestamp in seconds converted to days
-     * @param string $time timestamp in seconds
-     */
-    function addToCrawlSchedules(&$sites, $address, $day, $time)
-    {
-        $base_dir = CRAWL_DIR."/schedules/".
-            self::schedule_data_base_name.$_REQUEST['crawl_time'];
-        $scheduler_info = array();
-
-        if(isset($sites[self::TO_CRAWL])) {
-            $scheduler_info[self::TO_CRAWL] = $sites[self::TO_CRAWL];
-        }
-
-        $scheduler_info[self::MACHINE] = $_SERVER['REMOTE_ADDR'];
-
-        if(isset($sites[self::SCHEDULE_TIME])) {
-            $scheduler_info[self::SCHEDULE_TIME] = $sites[self::SCHEDULE_TIME];
-        }
-
-        if(isset($sites[self::SEEN_URLS])) {
-            $seen_sites = $sites[self::SEEN_URLS];
-            $num_seen = count($seen_sites);
-
-            for($i = 0; $i < $num_seen; $i++) {
-                $scheduler_info[self::SEEN_URLS][$i] =
-                    $seen_sites[$i][self::URL];
-            }
-        }
-        $this->addScheduleToScheduleDirectory(
-            $base_dir, $scheduler_info, $address, $day, $time);
-        $sites[self::TO_CRAWL] = NULL;
-    }
-
-    /**
-     * Adds a file containing the robot site data from the just received
-     * $sites array to the schedules folder's robot data directory's
-     * subfolder for the current crawl time. This file is added in a sub folder
-     * $day and its name contains the $time at which it arrived and the ip
-     * $address from which it arrived. This file will then be process later
-     * by the queue server.
-     *
-     * @param &array $sites a list of seen sites and an inverted inverted index
-     * @param string $address the IP address of the sending machine with . -->_
-     * @param string $day timestamp in seconds converted to days
-     * @param string $time timestamp in seconds
-     */
-    function addRobotSchedules(&$sites, $address, $day, $time)
-    {
-        $robot_dir = CRAWL_DIR."/schedules/".
-            self::robot_data_base_name.$_REQUEST['crawl_time'];
-        if(isset($sites[self::ROBOT_TXT])) {
-            $data = $sites[self::ROBOT_TXT];
-        } else {
-            $data = array();
-        }
-        $this->addScheduleToScheduleDirectory(
-            $robot_dir, $data, $address, $day, $time);
-        $sites[self::ROBOT_TXT] = NULL;
-    }
-
     /**
      * Adds a file with contents $data and with name containing $address and
      * $time to a subfolder $day of a folder $dir
      *
-     * @param string $dir directory in which to add the schedule file
-     * @param &array $data data that the schedule file is to contain
-     * @param string $address the IP address of the sending machine with . -->_
-     * @param string $day timestamp in seconds converted to days
-     * @param string $time timestamp in seconds
+     * @param string $schedule_name the name of the kind of schedule being
+     *     saved
+     * @param string &$data_string encoded, compressed, serialized data the
+     *     schedule is to contain
      */
-    function addScheduleToScheduleDirectory($dir, &$data, $address, $day, $time)
+    function addScheduleToScheduleDirectory($schedule_name, &$data_string)
     {
+        $dir = CRAWL_DIR."/schedules/".$schedule_name.$_REQUEST['crawl_time'];
+
+        $address = str_replace(".", "-", $_SERVER['REMOTE_ADDR']);
+        $address = str_replace(":", "_", $address);
+        $time = time();
+        $day = floor($time/86400);
+
         if(!file_exists($dir)) {
             mkdir($dir);
             chmod($dir, 0777);
@@ -277,12 +184,18 @@ class FetchController extends Controller implements CrawlConstants
             mkdir($dir);
             chmod($dir, 0777);
         }
-
-        $data_string = serialize($data);
+        $machine_data = array();
+        $machine_data[self::MACHINE] = $_SERVER['REMOTE_ADDR'];
+        $machine_data[self::MACHINE_URI] = $_REQUEST['machine_uri'];
+        $machine_string = base64_encode(serialize($machine_data))."\n";
+
         $data_hash = crawlHash($data_string);
-        file_put_contents(
-            $dir."/At".$time."From".$address.
-            "WithHash$data_hash.txt", $data_string);
+        $fh = fopen($dir."/At".$time."From".$address.
+            "WithHash$data_hash.txt", "wb");
+        fwrite($fh, $machine_string);
+        fwrite($fh, $data_string);
+        fclose($fh);
+
     }
 
     /**
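Review note: addScheduleToScheduleDirectory() now fixes a single two-part on-disk layout for every schedule kind: line one is base64-encoded serialized machine data, and the rest of the file is the payload exactly as received (still urlencoded). A sketch of the writer side (standalone PHP; path, address, and hash are hypothetical stand-ins for the real values):

    <?php
    // Write a schedule file in the new two-part layout.
    $machine_data = array('MACHINE' => '127.0.0.1', 'MACHINE_URI' => '/');
    $machine_string = base64_encode(serialize($machine_data))."\n";
    $data_string = "still-urlencoded-payload"; // as received in $_REQUEST
    $name = "/tmp/At".time()."From127-0-0-1WithHashXYZ.txt";
    $fh = fopen($name, "wb");
    fwrite($fh, $machine_string); // line 1: machine info
    fwrite($fh, $data_string);    // remainder: compressed payload
    fclose($fh);

Because the machine line is newline-terminated base64 and the payload is urlencoded (a raw newline would appear as %0A), neither part can contain a stray "\n", which is what makes the fgets()/fread() split on the queue-server side safe.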
diff --git a/index.php b/index.php
index bf0120a1d..c94b06887 100755
--- a/index.php
+++ b/index.php
@@ -42,7 +42,7 @@
 define("BASE_DIR", substr($_SERVER['SCRIPT_FILENAME'], 0,-strlen("index.php")));
 /**
  * Load the configuration file
  */
 require_once(BASE_DIR.'configs/config.php');
-ini_set("memory_limit","450M");
+ini_set("memory_limit","500M");
 header("X-FRAME-OPTIONS: DENY"); //prevent click jacking
 session_name(SESSION_NAME);
 session_start();
diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 0a2dcf79e..12a98b732 100644
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -128,5 +128,8 @@ interface CrawlConstants
     const FILETYPE = 'ag';
     const SUMMARY = 'ah';
     const URL_INFO = 'ai';
+    const HASH_SEEN_URLS = 'aj';
+    const RECENT_URLS = 'ak';
+    const MEMORY_USAGE = 'al';
 }
 ?>
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index f048e4128..72f90010c 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -429,7 +429,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants
             $slice_cnt--;
         }
         if($min_common !== NULL) {
-            $out_data[0][$word_key .":". $min_common][self::POINT_BLOCK] = 0;
+            $out_data[
+                0][$word_key .":". $min_common][self::POINT_BLOCK] = 0;
                 // this index needs to point to previous block with word
         }
diff --git a/lib/persistent_structure.php b/lib/persistent_structure.php
index 40594e750..a471a0d4a 100755
--- a/lib/persistent_structure.php
+++ b/lib/persistent_structure.php
@@ -48,7 +48,8 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 class PersistentStructure
 {
-    /** If not specified in the constructor, this will be the number of operations between saves
+    /** If not specified in the constructor, this will be the number of
+     *  operations between saves
      * @var int
      */
     const DEFAULT_SAVE_FREQUENCY = 50000;
@@ -72,7 +73,7 @@ class PersistentStructure
      * @param string $fname the name of the file to store the
      *     PersistentStructure in
      * @param int $save_frequency the number of operation before a save If
-     *     <= 0 never save
+     *     <= 0, a save is never automatically triggered
      */
     public function __construct($fname,
         $save_frequency = self::DEFAULT_SAVE_FREQUENCY)
@@ -94,7 +95,9 @@ class PersistentStructure
     }
 
     /**
-     * Save the PersistentStructure to its filename
+     * Save the PersistentStructure to its filename
+     * This method is generic but quite memory inefficient, so subclasses
+     * may need to reimplement it
      */
     public function save()
     {
diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php
index 967859823..22d2d9008 100755
--- a/lib/phrase_parser.php
+++ b/lib/phrase_parser.php
@@ -63,9 +63,10 @@ class PhraseParser
      */
     static function extractWordStringPageSummary($page)
     {
-        $title_phrase_string = mb_ereg_replace("[[:punct:]]", " ",
+        $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\|";
+        $title_phrase_string = mb_ereg_replace($punct, " ",
             $page[CrawlConstants::TITLE]);
-        $description_phrase_string = mb_ereg_replace("[[:punct:]]", " ",
+        $description_phrase_string = mb_ereg_replace($punct, " ",
             $page[CrawlConstants::DESCRIPTION]);
         $page_string = $title_phrase_string . " " . $description_phrase_string;
@@ -132,7 +133,8 @@ class PhraseParser
     static function extractPhrasesOfLengthOffset($string,
         $phrase_len, $offset)
     {
-        $words = mb_split("[[:space:]]", $string);
+        $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\|";
+        $words = mb_split("[[:space:]]|".$punct, $string);
 
         $stems = array();
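Review note: the same explicit $punct alternation now appears here and in PhraseModel, replacing the broader POSIX class [[:punct:]], presumably so that exactly this fixed set of ASCII characters (and nothing else) splits words. A quick check of the new splitting behavior (standalone PHP; sample text hypothetical):

    <?php
    // Split on whitespace or any of the escaped punctuation characters.
    $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\|";
    $words = mb_split("[[:space:]]|".$punct, "Hello, world: a (test)!");
    // mb_split leaves empty strings where delimiters touch; drop them.
    print_r(array_values(array_filter($words, 'strlen')));
    // Array ( [0] => Hello [1] => world [2] => a [3] => test )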
diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php
index 8e768c705..a3fae469c 100755
--- a/lib/processors/text_processor.php
+++ b/lib/processors/text_processor.php
@@ -119,9 +119,12 @@ class TextProcessor implements CrawlConstants
             '@((http|https)://([^ \t\r\n\v\f\'\"\;\,\<\>\[\]\{\}\(\)])*)@i';
         $sites = array();
         preg_match_all($pattern, $page, $matches);
+        $i = 0;
         foreach($matches[0] as $url) {
             if(!isset($sites[$url])) {
                 $sites[$url] = strip_tags($url);
+                $i++;
+                if($i >= MAX_LINKS_PER_PAGE) {break;}
             }
         }
         return $sites;
diff --git a/lib/string_array.php b/lib/string_array.php
index 3907eb805..831ea4dd8 100755
--- a/lib/string_array.php
+++ b/lib/string_array.php
@@ -65,7 +65,7 @@ class StringArray extends PersistentStructure
      * Number of bytes of storage need by the string array
      * @var int
      */
-    var $array_size;
+    var $string_array_size;
     /**
      * Character string used to store the packed data of the StringArray
      * @var string
@@ -98,6 +98,39 @@ class StringArray extends PersistentStructure
 
     }
 
+    /**
+     * Load a StringArray from a file
+     *
+     * @param string $fname the name of the file to load the StringArray from
+     * @return object the PersistentStructure loaded
+     */
+    public static function load($fname)
+    {
+        $fh = fopen($fname, "rb");
+        $tmp = unpack("N", fread($fh, 4));
+        $array_size = $tmp[1];
+        $array = fread($fh, $array_size);
+        $object = unserialize(fread($fh,
+            filesize($fname) - 4 - $array_size));
+        $object->string_array = & $array;
+        fclose($fh);
+        return $object;
+    }
+
+    /**
+     * Save the StringArray to its filename
+     */
+    public function save()
+    {
+        $fh = fopen($this->filename, "wb");
+        $tmp = & $this->string_array;
+        fwrite($fh, pack("N", $this->string_array_size));
+        fwrite($fh, $this->string_array);
+        unset($this->string_array);
+        fwrite($fh, serialize($this));
+        $this->string_array = & $tmp;
+        fclose($fh);
+    }
 
     /**
      * Looks up the ith item in the StringArray
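Review note: StringArray now bypasses the generic PersistentStructure::save() so the potentially very large packed string is never duplicated inside serialize(). The file layout is: a 4-byte big-endian length, the raw string bytes, then the serialized object with string_array temporarily unset. A sketch of just that framing (standalone PHP; the 16-byte payload is a stand-in):

    <?php
    // Frame: [4-byte big-endian length][raw bytes][serialized remainder]
    $raw = str_repeat("\x00", 16); // stand-in for the packed array data
    $rest = serialize(array('string_array_size' => 16));
    $frame = pack("N", strlen($raw)).$raw.$rest;

    // Read it back, mirroring StringArray::load().
    $tmp = unpack("N", substr($frame, 0, 4));
    $size = $tmp[1];
    $data = substr($frame, 4, $size);
    $object = unserialize(substr($frame, 4 + $size));
    var_dump($size === strlen($data)); // bool(true)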
diff --git a/lib/web_queue_bundle.php b/lib/web_queue_bundle.php
index efb06bd13..5b257940b 100755
--- a/lib/web_queue_bundle.php
+++ b/lib/web_queue_bundle.php
@@ -348,11 +348,16 @@ class WebQueueBundle implements Notifier
      * url is scheduled to be crawled. It only deletes the item from
      * the bundles priority queue and hash table -- not from the web archive.
      *
-     * @param string $url the url to delete
+     * @param string $url the url or hash of url to delete
+     * @param bool $isHash flag to say whether or not $url is the hash of a url
      */
-    function removeQueue($url)
+    function removeQueue($url, $isHash = false)
     {
-        $hash_url = crawlHash($url, true);
+        if($isHash == true) {
+            $hash_url = $url;
+        } else {
+            $hash_url = crawlHash($url, true);
+        }
         $data = $this->lookupHashTable($hash_url);
 
         if(!$data) {
@@ -653,7 +658,7 @@ class WebQueueBundle implements Notifier
      * The problem is as the table gets reused a lot, it tends to fill up
      * with a lot of deleted entries making lookup times get more and more
      * linear in the hash table size. By rebuilding the table we mitigate
-     * against this problem. By choosing the rebuild frequecy appropriately,
+     * against this problem. By choosing the rebuild frequency appropriately,
      * the amortized cost of this operation is only O(1).
      */
     function rebuildHashTable()
diff --git a/models/phrase_model.php b/models/phrase_model.php
index 84b732abb..e6d2d6db0 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -198,8 +198,8 @@ class PhraseModel extends Model
         $index_archive_name = self::index_data_base_name . $index_name;
         $index_archive = new IndexArchiveBundle(
             CRAWL_DIR.'/cache/'.$index_archive_name);
-
-        $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $phrase_string);
+        $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\|";
+        $phrase_string = mb_ereg_replace($punct, " ", $phrase_string);
         $phrase_string = preg_replace("/(\s)+/", " ", $phrase_string);
         /*