diff --git a/bin/fetcher.php b/bin/fetcher.php
index 90eb49bb1..9d76c66e9 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -42,7 +42,7 @@ define("BASE_DIR", substr(
dirname(realpath($_SERVER['PHP_SELF'])), 0,
-strlen("/bin")));
-ini_set("memory_limit","1100M"); //so have enough memory to crawl big pages
+ini_set("memory_limit","850M"); //so we have enough memory to crawl sitemaps
/** Load in global configuration settings */
require_once BASE_DIR.'/configs/config.php';
@@ -964,7 +964,7 @@ class Fetcher implements CrawlConstants
function processFetchPages($site_pages)
{
$PAGE_PROCESSORS = $this->page_processors;
- crawlLog(" Start process pages...");
+ crawlLog(" Start process pages... Current Memory:".memory_get_usage());
$start_time = microtime();
$prefix = $this->fetcher_num."-";
@@ -1187,7 +1187,8 @@ class Fetcher implements CrawlConstants
$cache_page_partition;
}
}
- crawlLog(" Process pages time".(changeInMicrotime($start_time)));
+ crawlLog(" Process pages time".(changeInMicrotime($start_time)).
+ " Current Memory:".memory_get_usage());
return $summarized_site_pages;
}
@@ -1554,12 +1555,10 @@ class Fetcher implements CrawlConstants
$bytes_to_send += strlen($post_data['schedule_data']);
}
unset($schedule_data);
-
//handle mini inverted index
if($seen_cnt > 0 ) {
$this->buildMiniInvertedIndex();
}
- crawlLog("...");
if(isset($this->found_sites[self::INVERTED_INDEX][
$this->current_server])) {
$compress_urls = "";
@@ -1571,13 +1570,16 @@ class Fetcher implements CrawlConstants
unset($this->found_sites[self::SEEN_URLS]);
$len_urls = strlen($compress_urls);
crawlLog("...Finish Compressing seen URLs.");
- $post_data['index_data'] = webencode( packInt($len_urls).
- $compress_urls. $this->found_sites[self::INVERTED_INDEX][
- $this->current_server]
- ); // don't compress index data
+ $out_string = packInt($len_urls). $compress_urls;
unset($compress_urls);
+ $out_string .= $this->found_sites[self::INVERTED_INDEX][
+ $this->current_server];
unset($this->found_sites[self::INVERTED_INDEX][
$this->current_server]);
+ gc_collect_cycles();
+ $post_data['index_data'] = webencode($out_string);
+ // don't compress index data
+ unset($out_string);
$bytes_to_send += strlen($post_data['index_data']);
}
@@ -1621,6 +1623,7 @@ class Fetcher implements CrawlConstants
memory_get_peak_usage());
} while(!isset($info[self::STATUS]) ||
$info[self::STATUS] != self::CONTINUE_STATE);
+ crawlLog("... Current Memory G:".memory_get_usage());
if($this->crawl_type == self::WEB_CRAWL) {
$dir = CRAWL_DIR."/schedules";
file_put_contents("$dir/$prefix".self::fetch_batch_name.
@@ -1649,7 +1652,8 @@ class Fetcher implements CrawlConstants
global $IMAGE_TYPES;
$start_time = microtime();
- crawlLog(" Start building mini inverted index ... ");
+ crawlLog(" Start building mini inverted index ... Current Memory:".
+ memory_get_usage());
$num_seen = count($this->found_sites[self::SEEN_URLS]);
$this->num_seen_sites += $num_seen;
/*
@@ -1657,9 +1661,10 @@ class Fetcher implements CrawlConstants
name doesn't matter.
*/
if(!isset($this->found_sites[self::INVERTED_INDEX][
- $this->current_server]))
+ $this->current_server])) {
$this->found_sites[self::INVERTED_INDEX][$this->current_server] =
new IndexShard("fetcher_shard_{$this->current_server}");
+ }
for($i = 0; $i < $num_seen; $i++) {
$site = $this->found_sites[self::SEEN_URLS][$i];
if(!isset($site[self::HASH])) {continue; }
@@ -1765,16 +1770,9 @@ class Fetcher implements CrawlConstants
if(isset($safe) && !$safe) {
$link_meta_ids[] = "safe:false";
}
- } else if(UrlParser::isVideoUrl($url)) {
- $link_meta_ids[] = "media:video";
- if(isset($safe) && !$safe) {
- $link_meta_ids[] = "safe:false";
- }
} else {
$link_meta_ids[] = "media:text";
}
- $link_text =
- mb_ereg_replace(PUNCT, " ", $link_text);
$link_word_lists =
PhraseParser::extractPhrasesInLists($link_text,
$lang, true);
diff --git a/bin/queue_server.php b/bin/queue_server.php
index af2eac34c..d131f09ad 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -593,9 +593,10 @@ class QueueServer implements CrawlConstants, Join
function &getDataArchiveFileData($file)
{
crawlLog("Processing File: $file");
-
- $sites = unserialize(gzuncompress(webdecode(file_get_contents($file) ))
- );
+ $decode = file_get_contents($file); //raw, web-encoded file contents
+ $decode = webdecode($decode); //undo the url-safe base64 encoding
+ $decode = gzuncompress($decode); //inflate the compressed payload
+ $sites = unserialize($decode); //NOTE(review): confirm file is trusted
return $sites;
}
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index fc15ca5e3..8d4be83de 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -506,6 +506,7 @@ class FetchUrl implements CrawlConstants
}
crawlLog("Set curl options");
$response = curl_exec($agents[$host]);
+ curl_setopt($agents[$host], CURLOPT_POSTFIELDS, "");
crawlLog("Done curl exec");
return $response;
}
diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php
index 3926b3c40..eecbf9d18 100755
--- a/lib/phrase_parser.php
+++ b/lib/phrase_parser.php
@@ -180,44 +180,52 @@ class PhraseParser
*/
static function canonicalizePunctuatedTerms(&$string, $lang = NULL)
{
-
+ //these statics cache the callbacks, because PHP 5.2 does not garbage
+ //collect functions made by create_function; each is built only once
+ static $replace_function0, $replace_function1, $replace_function2;
$acronym_pattern = "/[A-Za-z]\.(\s*[A-Za-z]\.)+/";
- $replace_function = create_function('$matches', '
- $result = "_".mb_strtolower(
- mb_ereg_replace("\.", "", $matches[0]));
- return $result;');
+ if(!isset($replace_function0)) { //build acronym callback only once
+ $replace_function0 = create_function('$matches', '
+ $result = "_".mb_strtolower(
+ mb_ereg_replace("\.", "", $matches[0]));
+ return $result;');
+ }
$string = preg_replace_callback($acronym_pattern,
- $replace_function, $string);
+ $replace_function0, $string);
$ampersand_pattern = "/[A-Za-z]+(\s*(\s(\'n|\'N)\s|\&)\s*[A-Za-z])+/";
- $replace_function = create_function('$matches', '
- $result = mb_strtolower(
- mb_ereg_replace("\s*(\'n|\'N|\&)\s*", "_and_",$matches[0]));
- return $result;
- ');
- $string = preg_replace_callback($ampersand_pattern, $replace_function,
+ if(!isset($replace_function1)) { //build ampersand callback only once
+ $replace_function1 = create_function('$matches', '
+ $result = mb_strtolower(
+ mb_ereg_replace("\s*(\'n|\'N|\&)\s*", "_and_",$matches[0]));
+ return $result;
+ ');
+ }
+ $string = preg_replace_callback($ampersand_pattern,$replace_function1,
$string);
$url_or_email_pattern =
'@((http|https)://([^ \t\r\n\v\f\'\"\;\,<>])*)|'.
'([A-Z0-9._%-]+\@[A-Z0-9.-]+\.[A-Z]{2,4})@i';
- $replace_function = create_function('$matches', '
- $result = mb_ereg_replace("\.", "_d_",$matches[0]);
- $result = mb_ereg_replace("\:", "_c_",$result);
- $result = mb_ereg_replace("\/", "_s_",$result);
- $result = mb_ereg_replace("\@", "_a_",$result);
- $result = mb_ereg_replace("\[", "_bo_",$result);
- $result = mb_ereg_replace("\]", "_bc_",$result);
- $result = mb_ereg_replace("\(", "_po_",$result);
- $result = mb_ereg_replace("\)", "_pc_",$result);
- $result = mb_ereg_replace("\?", "_q_",$result);
- $result = mb_ereg_replace("\=", "_e_",$result);
- $result = mb_ereg_replace("\&", "_a_",$result);
- $result = mb_strtolower($result);
- return $result;
- ');
+ if(!isset($replace_function2)) { //build url/email callback only once
+ $replace_function2 = create_function('$matches', '
+ $result = mb_ereg_replace("\.", "_d_",$matches[0]);
+ $result = mb_ereg_replace("\:", "_c_",$result);
+ $result = mb_ereg_replace("\/", "_s_",$result);
+ $result = mb_ereg_replace("\@", "_a_",$result);
+ $result = mb_ereg_replace("\[", "_bo_",$result);
+ $result = mb_ereg_replace("\]", "_bc_",$result);
+ $result = mb_ereg_replace("\(", "_po_",$result);
+ $result = mb_ereg_replace("\)", "_pc_",$result);
+ $result = mb_ereg_replace("\?", "_q_",$result);
+ $result = mb_ereg_replace("\=", "_e_",$result);
+ $result = mb_ereg_replace("\&", "_a_",$result);
+ $result = mb_strtolower($result);
+ return $result;
+ ');
+ }
$string = preg_replace_callback($url_or_email_pattern,
- $replace_function, $string);
+ $replace_function2, $string);
}
/**
@@ -409,7 +417,7 @@ class PhraseParser
* @param int $len length of text being examined in characters
* @return int $score of how explicit document is
*/
- static function computeSafeSearchScore($word_lists, $len)
+ static function computeSafeSearchScore(&$word_lists, $len)
{
static $unsafe_phrase = "
XXX sex slut nymphomaniac MILF lolita lesbian sadomasochism
@@ -433,6 +441,10 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け 二形 ふたなりゴッ
炒饭 炒飯 cặc lồn kaltak orospu siktir sıçmak amcık";
static $unsafe_terms = array();
+ if(count($word_lists) == 0) {
+ return 0;
+ }
+
if($unsafe_terms == array()) {
$unsafe_lists = PhraseParser::extractPhrasesInLists($unsafe_phrase,
"en-US", true);
diff --git a/lib/url_parser.php b/lib/url_parser.php
index 89b8ab078..b74903455 100755
--- a/lib/url_parser.php
+++ b/lib/url_parser.php
@@ -705,13 +705,13 @@ class UrlParser
* Checks if a URL corresponds to a known playback page of a video
* sharing site
*
- * @param string &$url the url to check
+ * @param string $url the url to check
* @return bool whether or not corresponds to video playback page of a known
* video site
*/
static function isVideoUrl(&$url)
{
- $video_prefixes = array("http://www.youtube.com/watch?v=",
+ static $video_prefixes = array("http://www.youtube.com/watch?v=",
"http://www.metacafe.com/watch/",
"http://screen.yahoo.com/",
"http://player.vimeo.com/video/",
@@ -719,9 +719,19 @@ class UrlParser
"http://www.dailymotion.com/video/",
"http://v.youku.com/v_playlist/",
"http://www.break.com/index/");
- foreach($video_prefixes as $prefix) {
- $quoted = preg_quote($prefix, "/");
- $pattern = "/$quoted/";
+ static $patterns = array();
+
+ if(strlen($url) <= 0 ) {
+ return false;
+ }
+ if($patterns == array()) {
+ foreach($video_prefixes as $prefix) {
+ $quoted = preg_quote($prefix, "/");
+ $patterns[] = "/$quoted/";
+ }
+ }
+
+ foreach($patterns as $pattern) {
if(preg_match($pattern, $url) > 0) {
return true;
}
diff --git a/lib/utility.php b/lib/utility.php
index 7a2a5b80b..b1c24e8af 100755
--- a/lib/utility.php
+++ b/lib/utility.php
@@ -489,7 +489,8 @@ function unbase64Hash($base64)
*/
function webencode($str)
{
- $str = str_replace("/", "_", base64_encode($str));
+ $str = base64_encode($str);
+ $str = str_replace("/", "_", $str);
$str = str_replace("+", ".", $str);
$str = str_replace("=", "~", $str);
return $str;
@@ -505,7 +506,8 @@ function webdecode($str)
{
$str = str_replace("_", "/", $str);
$str = str_replace(".", "+", $str);
- return base64_decode(str_replace("~", "=", $str));
+ $str = str_replace("~", "=", $str);
+ return base64_decode($str);
}
/**