Fixes a memory leak introduced by using create_function to regain compatibility with cPanel environments that still run PHP 5.2, a=chris

Chris Pollett [2012-05-07]
Fixes a memory leak introduced by using create_function to regain compatibility with cPanel environments that still run PHP 5.2, a=chris
Filename
bin/fetcher.php
bin/queue_server.php
lib/fetch_url.php
lib/phrase_parser.php
lib/url_parser.php
lib/utility.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 90eb49bb1..9d76c66e9 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -42,7 +42,7 @@ define("BASE_DIR", substr(
     dirname(realpath($_SERVER['PHP_SELF'])), 0,
     -strlen("/bin")));

-ini_set("memory_limit","1100M"); //so have enough memory to crawl big pages
+ini_set("memory_limit","850M"); //so have enough memory to crawl sitemaps

 /** Load in global configuration settings */
 require_once BASE_DIR.'/configs/config.php';
@@ -964,7 +964,7 @@ class Fetcher implements CrawlConstants
     function processFetchPages($site_pages)
     {
         $PAGE_PROCESSORS = $this->page_processors;
-        crawlLog("  Start process pages...");
+        crawlLog("  Start process pages... Current Memory:".memory_get_usage());
         $start_time = microtime();

         $prefix = $this->fetcher_num."-";
@@ -1187,7 +1187,8 @@ class Fetcher implements CrawlConstants
                     $cache_page_partition;
             }
         }
-        crawlLog("  Process pages time".(changeInMicrotime($start_time)));
+        crawlLog("  Process pages time".(changeInMicrotime($start_time)).
+             " Current Memory:".memory_get_usage());

         return $summarized_site_pages;
     }
@@ -1554,12 +1555,10 @@ class Fetcher implements CrawlConstants
             $bytes_to_send += strlen($post_data['schedule_data']);
         }
         unset($schedule_data);
-
         //handle mini inverted index
         if($seen_cnt > 0 ) {
             $this->buildMiniInvertedIndex();
         }
-        crawlLog("...");
         if(isset($this->found_sites[self::INVERTED_INDEX][
             $this->current_server])) {
             $compress_urls = "";
@@ -1571,13 +1570,16 @@ class Fetcher implements CrawlConstants
             unset($this->found_sites[self::SEEN_URLS]);
             $len_urls =  strlen($compress_urls);
             crawlLog("...Finish Compressing seen URLs.");
-            $post_data['index_data'] = webencode( packInt($len_urls).
-                $compress_urls. $this->found_sites[self::INVERTED_INDEX][
-                $this->current_server]
-                ); // don't compress index data
+            $out_string = packInt($len_urls). $compress_urls;
             unset($compress_urls);
+            $out_string .= $this->found_sites[self::INVERTED_INDEX][
+                $this->current_server];
             unset($this->found_sites[self::INVERTED_INDEX][
                 $this->current_server]);
+            gc_collect_cycles();
+            $post_data['index_data'] = webencode($out_string);
+                // don't compress index data
+            unset($out_string);
             $bytes_to_send += strlen($post_data['index_data']);
         }

@@ -1621,6 +1623,7 @@ class Fetcher implements CrawlConstants
                 memory_get_peak_usage());
         } while(!isset($info[self::STATUS]) ||
             $info[self::STATUS] != self::CONTINUE_STATE);
+        crawlLog("...  Current Memory G:".memory_get_usage());
         if($this->crawl_type == self::WEB_CRAWL) {
             $dir = CRAWL_DIR."/schedules";
             file_put_contents("$dir/$prefix".self::fetch_batch_name.
@@ -1649,7 +1652,8 @@ class Fetcher implements CrawlConstants
         global $IMAGE_TYPES;

         $start_time = microtime();
-        crawlLog("  Start building mini inverted index ... ");
+        crawlLog("  Start building mini inverted index ...  Current Memory:".
+            memory_get_usage());
         $num_seen = count($this->found_sites[self::SEEN_URLS]);
         $this->num_seen_sites += $num_seen;
         /*
@@ -1657,9 +1661,10 @@ class Fetcher implements CrawlConstants
             name doesn't matter.
         */
         if(!isset($this->found_sites[self::INVERTED_INDEX][
-            $this->current_server]))
+            $this->current_server])) {
             $this->found_sites[self::INVERTED_INDEX][$this->current_server] =
                 new IndexShard("fetcher_shard_{$this->current_server}");
+        }
         for($i = 0; $i < $num_seen; $i++) {
             $site = $this->found_sites[self::SEEN_URLS][$i];
             if(!isset($site[self::HASH])) {continue; }
@@ -1765,16 +1770,9 @@ class Fetcher implements CrawlConstants
                         if(isset($safe) && !$safe) {
                             $link_meta_ids[] = "safe:false";
                         }
-                    } else if(UrlParser::isVideoUrl($url)) {
-                        $link_meta_ids[] = "media:video";
-                        if(isset($safe) && !$safe) {
-                            $link_meta_ids[] = "safe:false";
-                        }
                     } else {
                         $link_meta_ids[] = "media:text";
                     }
-                    $link_text =
-                        mb_ereg_replace(PUNCT, " ", $link_text);
                     $link_word_lists =
                         PhraseParser::extractPhrasesInLists($link_text,
                         $lang, true);
diff --git a/bin/queue_server.php b/bin/queue_server.php
index af2eac34c..d131f09ad 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -593,9 +593,10 @@ class QueueServer implements CrawlConstants, Join
     function &getDataArchiveFileData($file)
     {
         crawlLog("Processing File: $file");
-
-        $sites = unserialize(gzuncompress(webdecode(file_get_contents($file) ))
-            );
+        $decode = file_get_contents($file);
+        $decode = webdecode($decode);
+        $decode = gzuncompress($decode);
+        $sites = unserialize($decode);

         return $sites;
     }
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index fc15ca5e3..8d4be83de 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -506,6 +506,7 @@ class FetchUrl implements CrawlConstants
         }
         crawlLog("Set curl options");
         $response = curl_exec($agents[$host]);
+        curl_setopt($agents[$host], CURLOPT_POSTFIELDS, "");
         crawlLog("Done curl exec");
         return $response;
     }
diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php
index 3926b3c40..eecbf9d18 100755
--- a/lib/phrase_parser.php
+++ b/lib/phrase_parser.php
@@ -180,44 +180,52 @@ class PhraseParser
      */
     static function canonicalizePunctuatedTerms(&$string, $lang = NULL)
     {
-
+        //these obscure statics are needed because PHP 5.2 does not garbage
+        //collect functions made with create_function
+        static $replace_function0, $replace_function1, $replace_function2;
         $acronym_pattern = "/[A-Za-z]\.(\s*[A-Za-z]\.)+/";
-        $replace_function = create_function('$matches', '
-            $result = "_".mb_strtolower(
-                mb_ereg_replace("\.", "", $matches[0]));
-            return $result;');
+        if(!isset($replace_function0)) {
+            $replace_function0 = create_function('$matches', '
+                $result = "_".mb_strtolower(
+                    mb_ereg_replace("\.", "", $matches[0]));
+                return $result;');
+        }
         $string = preg_replace_callback($acronym_pattern,
-            $replace_function, $string);
+            $replace_function0, $string);

         $ampersand_pattern = "/[A-Za-z]+(\s*(\s(\'n|\'N)\s|\&)\s*[A-Za-z])+/";
-        $replace_function = create_function('$matches', '
-            $result = mb_strtolower(
-                mb_ereg_replace("\s*(\'n|\'N|\&)\s*", "_and_",$matches[0]));
-            return $result;
-        ');
-        $string = preg_replace_callback($ampersand_pattern, $replace_function,
+        if(!isset($replace_function1)) {
+            $replace_function1 = create_function('$matches', '
+                $result = mb_strtolower(
+                    mb_ereg_replace("\s*(\'n|\'N|\&)\s*", "_and_",$matches[0]));
+                return $result;
+            ');
+        }
+        $string = preg_replace_callback($ampersand_pattern,$replace_function1,
             $string);

         $url_or_email_pattern =
             '@((http|https)://([^ \t\r\n\v\f\'\"\;\,<>])*)|'.
             '([A-Z0-9._%-]+\@[A-Z0-9.-]+\.[A-Z]{2,4})@i';
-        $replace_function = create_function('$matches', '
-            $result =  mb_ereg_replace("\.", "_d_",$matches[0]);
-            $result =  mb_ereg_replace("\:", "_c_",$result);
-            $result =  mb_ereg_replace("\/", "_s_",$result);
-            $result =  mb_ereg_replace("\@", "_a_",$result);
-            $result =  mb_ereg_replace("\[", "_bo_",$result);
-            $result =  mb_ereg_replace("\]", "_bc_",$result);
-            $result =  mb_ereg_replace("\(", "_po_",$result);
-            $result =  mb_ereg_replace("\)", "_pc_",$result);
-            $result =  mb_ereg_replace("\?", "_q_",$result);
-            $result =  mb_ereg_replace("\=", "_e_",$result);
-            $result =  mb_ereg_replace("\&", "_a_",$result);
-            $result = mb_strtolower($result);
-            return $result;
-        ');
+        if(!isset($replace_function2)) {
+            $replace_function2 = create_function('$matches', '
+                $result =  mb_ereg_replace("\.", "_d_",$matches[0]);
+                $result =  mb_ereg_replace("\:", "_c_",$result);
+                $result =  mb_ereg_replace("\/", "_s_",$result);
+                $result =  mb_ereg_replace("\@", "_a_",$result);
+                $result =  mb_ereg_replace("\[", "_bo_",$result);
+                $result =  mb_ereg_replace("\]", "_bc_",$result);
+                $result =  mb_ereg_replace("\(", "_po_",$result);
+                $result =  mb_ereg_replace("\)", "_pc_",$result);
+                $result =  mb_ereg_replace("\?", "_q_",$result);
+                $result =  mb_ereg_replace("\=", "_e_",$result);
+                $result =  mb_ereg_replace("\&", "_a_",$result);
+                $result = mb_strtolower($result);
+                return $result;
+            ');
+        }
         $string = preg_replace_callback($url_or_email_pattern,
-            $replace_function, $string);
+            $replace_function2, $string);
     }

     /**
@@ -409,7 +417,7 @@ class PhraseParser
      *  @param int $len length of text being examined in characters
      *  @return int $score of how explicit document is
      */
-    static function computeSafeSearchScore($word_lists, $len)
+    static function computeSafeSearchScore(&$word_lists, $len)
     {
         static $unsafe_phrase = "
 XXX sex slut nymphomaniac MILF lolita lesbian sadomasochism
@@ -433,6 +441,10 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け  二形 ふたなりゴッ
 炒饭 炒飯 cặc lồn kaltak orospu siktir sıçmak amcık";
         static $unsafe_terms = array();

+        if(count($word_lists) == 0) {
+            return 0;
+        }
+
         if($unsafe_terms == array()) {
             $unsafe_lists = PhraseParser::extractPhrasesInLists($unsafe_phrase,
                 "en-US", true);
diff --git a/lib/url_parser.php b/lib/url_parser.php
index 89b8ab078..b74903455 100755
--- a/lib/url_parser.php
+++ b/lib/url_parser.php
@@ -705,13 +705,13 @@ class UrlParser
      * Checks if a URL corresponds to a known playback page of a video
      * sharing site
      *
-     * @param string &$url the url to check
+     * @param string $url the url to check
      * @return bool whether or not corresponds to video playback page of a known
      *      video site
      */
     static function isVideoUrl(&$url)
     {
-        $video_prefixes = array("http://www.youtube.com/watch?v=",
+        static $video_prefixes = array("http://www.youtube.com/watch?v=",
             "http://www.metacafe.com/watch/",
             "http://screen.yahoo.com/",
             "http://player.vimeo.com/video/",
@@ -719,9 +719,19 @@ class UrlParser
             "http://www.dailymotion.com/video/",
             "http://v.youku.com/v_playlist/",
             "http://www.break.com/index/");
-        foreach($video_prefixes as $prefix) {
-            $quoted = preg_quote($prefix, "/");
-            $pattern = "/$quoted/";
+        static $patterns = array();
+
+        if(strlen($url) <= 0 ) {
+            return false;
+        }
+        if($patterns == array()) {
+            foreach($video_prefixes as $prefix) {
+                $quoted = preg_quote($prefix, "/");
+                $patterns[] = "/$quoted/";
+            }
+        }
+
+        foreach($patterns as $pattern) {
             if(preg_match($pattern, $url) > 0) {
                 return true;
             }
diff --git a/lib/utility.php b/lib/utility.php
index 7a2a5b80b..b1c24e8af 100755
--- a/lib/utility.php
+++ b/lib/utility.php
@@ -489,7 +489,8 @@ function unbase64Hash($base64)
  */
 function webencode($str)
 {
-    $str = str_replace("/", "_", base64_encode($str));
+    $str = base64_encode($str);
+    $str = str_replace("/", "_", $str);
     $str = str_replace("+", ".", $str);
     $str = str_replace("=", "~", $str);
     return $str;
@@ -505,7 +506,8 @@ function webdecode($str)
 {
     $str = str_replace("_", "/", $str);
     $str = str_replace(".", "+", $str);
-    return base64_decode(str_replace("~", "=", $str));
+    $str = str_replace("~", "=", $str);
+    return base64_decode($str);
 }

 /**
ViewGit