Bumping memory requirements of fetcher, a=chris

Chris Pollett [2012-05-05 22h]
Bumping memory requirements of fetcher, a=chris
Filename
bin/fetcher.php
lib/phrase_parser.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index b87b51f16..08c4bea72 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -42,7 +42,7 @@ define("BASE_DIR", substr(
     dirname(realpath($_SERVER['PHP_SELF'])), 0,
     -strlen("/bin")));

-ini_set("memory_limit","850M"); //so have enough memory to crawl big pages
+ini_set("memory_limit","1000M"); //so have enough memory to crawl big pages

 /** Load in global configuration settings */
 require_once BASE_DIR.'/configs/config.php';
@@ -1649,7 +1649,7 @@ class Fetcher implements CrawlConstants
         global $IMAGE_TYPES;

         $start_time = microtime();
-
+        crawlLog("  Start building mini inverted index ... ");
         $num_seen = count($this->found_sites[self::SEEN_URLS]);
         $this->num_seen_sites += $num_seen;
         /*
@@ -1760,10 +1760,10 @@ class Fetcher implements CrawlConstants
                     $this->found_sites[self::SEEN_URLS][] = $summary;
                     $link_type = UrlParser::getDocumentType($url);
                     if(in_array($link_type, $IMAGE_TYPES)) {
+                        $link_meta_ids[] = "media:image";
                         if(isset($safe) && !$safe) {
                             $link_meta_ids[] = "safe:false";
                         }
-                        $link_meta_ids[] = "media:image";
                     } else if(UrlParser::isVideoUrl($url)) {
                         $link_meta_ids[] = "media:video";
                         if(isset($safe) && !$safe) {
diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php
index b3c00bcf8..26962f469 100755
--- a/lib/phrase_parser.php
+++ b/lib/phrase_parser.php
@@ -429,7 +429,7 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け  二形 ふたなりゴッ
 雞巴 鷄巴 雞雞 鷄鷄 阴茎 陰莖 胯下物
 屌 吊 小鳥 龟头 龜頭 屄 鸡白 雞白 傻屄 老二 那话儿 那話兒 屄 鸡白 雞白 阴道 陰道
 阴户 陰戶 大姨妈 淫蟲 老嫖 妓女 臭婊子 卖豆腐 賣豆腐 咪咪 大豆腐 爆乳 肏操
-炒饭 炒飯 cặc lồn kaltak orospu siktir sıçmak amcık ";
+炒饭 炒飯 cặc lồn kaltak orospu siktir sıçmak amcık";
         static $unsafe_terms = array();

         if($unsafe_terms == array()) {
ViewGit