Tries to compute the safe search score a different way to see if it reduces memory consumption, a=chris

Chris Pollett [2012-05-06]
Tries to compute the safe search score a different way to see if it reduces memory consumption, a=chris
Filename
bin/fetcher.php
lib/phrase_parser.php
lib/url_parser.php
tests/phrase_parser_test.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 56f3c3e2a..90eb49bb1 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -1694,8 +1694,9 @@ class Fetcher implements CrawlConstants
                 $word_lists =
                     PhraseParser::extractPhrasesInLists($phrase_string,
                         $lang, true);
-                if(PhraseParser::computeSafeSearchScore($phrase_string) <
-                    0.025) {
+                $len = strlen($phrase_string);
+                if(PhraseParser::computeSafeSearchScore($word_lists, $len) <
+                    0.012) {
                     $meta_ids[] = "safe:true";
                     $safe = true;
                 } else {
diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php
index 26962f469..3926b3c40 100755
--- a/lib/phrase_parser.php
+++ b/lib/phrase_parser.php
@@ -405,10 +405,11 @@ class PhraseParser
      *  Scores documents according to the lack or nonlack of sexually explicit
      *  terms. Tries to work for several languages.
      *
-     *  @param string $text passage to score
+     *  @param array $word_lists word => pos_list tuples
+     *  @param int $len length of text being examined in characters
      *  @return int $score of how explicit document is
      */
-    static function computeSafeSearchScore($text)
+    static function computeSafeSearchScore($word_lists, $len)
     {
         static $unsafe_phrase = "
 XXX sex slut nymphomaniac MILF lolita lesbian sadomasochism
@@ -433,25 +434,26 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け  二形 ふたなりゴッ
         static $unsafe_terms = array();

         if($unsafe_terms == array()) {
-           $pre_unsafe_terms = mb_split("(\s)+", $unsafe_phrase);
-            foreach($pre_unsafe_terms as $pre_unsafe) {
-                if(strlen($pre_unsafe) > 0) {
-                    $unsafe_terms[] = $pre_unsafe;
-                }
-            }
+            $unsafe_lists = PhraseParser::extractPhrasesInLists($unsafe_phrase,
+                "en-US", true);
+            $unsafe_terms = array_keys($unsafe_lists);
         }

         $num_unsafe_terms = 0;
         $unsafe_count = 0;
-        foreach($unsafe_terms as $term) {
-            $count = mb_substr_count($text, $term);
+        $words = array_keys($word_lists);
+
+        $unsafe_found = array_intersect($words, $unsafe_terms);
+
+        foreach($unsafe_found as $term) {
+            $count = count($word_lists[$term]);
             if($count > 0 ) {
                 $unsafe_count += $count;
                 $num_unsafe_terms++;
             }
         }

-        $score = $num_unsafe_terms * $unsafe_count/(strlen($text) + 1);
+        $score = $num_unsafe_terms * $unsafe_count/($len + 1);
         return $score;
     }
 }
diff --git a/lib/url_parser.php b/lib/url_parser.php
index 053b0b875..89b8ab078 100755
--- a/lib/url_parser.php
+++ b/lib/url_parser.php
@@ -705,11 +705,11 @@ class UrlParser
      * Checks if a URL corresponds to a known playback page of a video
      * sharing site
      *
-     * @param string $url the url to check
+     * @param string &$url the url to check
      * @return bool whether or not corresponds to video playback page of a known
      *      video site
      */
-    static function isVideoUrl($url)
+    static function isVideoUrl(&$url)
     {
         $video_prefixes = array("http://www.youtube.com/watch?v=",
             "http://www.metacafe.com/watch/",
diff --git a/tests/phrase_parser_test.php b/tests/phrase_parser_test.php
index b3c1b0482..3b9c2d253 100644
--- a/tests/phrase_parser_test.php
+++ b/tests/phrase_parser_test.php
@@ -166,8 +166,11 @@ small table the the the the the the the the the the the their there there
 this those three to to to trap uncle uncle wagon walls was was was was was
 were where which which whirlwinds who who wife with
 EOD;
-        $score = PhraseParser::computeSafeSearchScore($phrase_string);
-        $this->assertTrue(($score < 0.025), "Easy Safe Test 1");
+        $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+            "en-US", true);
+        $len = strlen($phrase_string);
+        $score = PhraseParser::computeSafeSearchScore($word_lists, $len);
+        $this->assertTrue(($score < 0.012), "Easy Safe Test 1");

         $phrase_string = <<< EOD
 a afraid all and anon baby big boobs but cock crave dicking does
@@ -175,8 +178,11 @@ for from grown has how in is isnt knot lolita matts monster pussies ready
 she she shew slut teens their thom them thought they're tight to to to total
 up use whether
 EOD;
-        $score = PhraseParser::computeSafeSearchScore($phrase_string);
-        $this->assertTrue(($score > 0.025), "Easy Unsafe Test 1");
+        $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+            "en-US", true);
+        $len = strlen($phrase_string);
+        $score = PhraseParser::computeSafeSearchScore($word_lists, $len);
+        $this->assertTrue(($score > 0.012), "Easy Unsafe Test 1");

         $phrase_string = <<< EOD
 a a a a a adventure after all alotta amazing and and and and and
@@ -191,8 +197,11 @@ those those those tit titties titty to to to togo today tramp truly
 us was we we we what what when what wild with with with workout wrap yes
 you
 EOD;
-        $score = PhraseParser::computeSafeSearchScore($phrase_string);
-        $this->assertTrue(($score > 0.025), "Harder Unsafe Test 1");
+        $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+            "en-US", true);
+        $len = strlen($phrase_string);
+        $score = PhraseParser::computeSafeSearchScore($word_lists, $len);
+        $this->assertTrue(($score > 0.012), "Harder Unsafe Test 1");

         $phrase_string = <<< EOD
 amino hog known a a a a an and and
@@ -206,8 +215,11 @@ over parents process reproduce reproduce result sex sex sexual
 sexual small specialist specialized specific such that that the the the
 the their to to traits traits transport two types variety while young
 EOD;
-        $score = PhraseParser::computeSafeSearchScore($phrase_string);
-        $this->assertTrue(($score < 0.025), "Harder Safe Test 1");
+        $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+            "en-US", true);
+        $len = strlen($phrase_string);
+        $score = PhraseParser::computeSafeSearchScore($word_lists, $len);
+        $this->assertTrue(($score < 0.012), "Harder Safe Test 1");

         $phrase_string = <<< EOD
 a a active adverb an an and are as as as attribute be
@@ -217,8 +229,11 @@ lesbian may moist verb object of of or or or others secondary refer relay
 romantic same sex sexual trim the the the them to to to to to used
 used who who wide women ward
 EOD;
-        $score = PhraseParser::computeSafeSearchScore($phrase_string);
-        $this->assertTrue(($score < 0.025), "Harder Safe Test 2");
+        $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
+            "en-US", true);
+        $len = strlen($phrase_string);
+        $score = PhraseParser::computeSafeSearchScore($word_lists, $len);
+        $this->assertTrue(($score < 0.012), "Harder Safe Test 2");

     }
 }
ViewGit