diff --git a/bin/fetcher.php b/bin/fetcher.php index 56f3c3e2a..90eb49bb1 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -1694,8 +1694,9 @@ class Fetcher implements CrawlConstants $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang, true); - if(PhraseParser::computeSafeSearchScore($phrase_string) < - 0.025) { + $len = strlen($phrase_string); + if(PhraseParser::computeSafeSearchScore($word_lists, $len) < + 0.012) { $meta_ids[] = "safe:true"; $safe = true; } else { diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php index 26962f469..3926b3c40 100755 --- a/lib/phrase_parser.php +++ b/lib/phrase_parser.php @@ -405,10 +405,11 @@ class PhraseParser * Scores documents according to the lack or nonlack of sexually explicit * terms. Tries to work for several languages. * - * @param string $text passage to score + * @param array $word_lists word => pos_list tuples + * @param int $len length of text being examined in characters * @return int $score of how explicit document is */ - static function computeSafeSearchScore($text) + static function computeSafeSearchScore($word_lists, $len) { static $unsafe_phrase = " XXX sex slut nymphomaniac MILF lolita lesbian sadomasochism @@ -433,25 +434,26 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け 二形 ふたなりゴッ static $unsafe_terms = array(); if($unsafe_terms == array()) { - $pre_unsafe_terms = mb_split("(\s)+", $unsafe_phrase); - foreach($pre_unsafe_terms as $pre_unsafe) { - if(strlen($pre_unsafe) > 0) { - $unsafe_terms[] = $pre_unsafe; - } - } + $unsafe_lists = PhraseParser::extractPhrasesInLists($unsafe_phrase, + "en-US", true); + $unsafe_terms = array_keys($unsafe_lists); } $num_unsafe_terms = 0; $unsafe_count = 0; - foreach($unsafe_terms as $term) { - $count = mb_substr_count($text, $term); + $words = array_keys($word_lists); + + $unsafe_found = array_intersect($words, $unsafe_terms); + + foreach($unsafe_found as $term) { + $count = count($word_lists[$term]); if($count > 0 ) { $unsafe_count += $count; $num_unsafe_terms++; } } - $score = $num_unsafe_terms * $unsafe_count/(strlen($text) + 1); + $score = $num_unsafe_terms * $unsafe_count/($len + 1); return $score; } } diff --git a/lib/url_parser.php b/lib/url_parser.php index 053b0b875..89b8ab078 100755 --- a/lib/url_parser.php +++ b/lib/url_parser.php @@ -705,11 +705,11 @@ class UrlParser * Checks if a URL corresponds to a known playback page of a video * sharing site * - * @param string $url the url to check + * @param string &$url the url to check * @return bool whether or not corresponds to video playback page of a known * video site */ - static function isVideoUrl($url) + static function isVideoUrl(&$url) { $video_prefixes = array("http://www.youtube.com/watch?v=", "http://www.metacafe.com/watch/", diff --git a/tests/phrase_parser_test.php b/tests/phrase_parser_test.php index b3c1b0482..3b9c2d253 100644 --- a/tests/phrase_parser_test.php +++ b/tests/phrase_parser_test.php @@ -166,8 +166,11 @@ small table the the the the the the the the the the the their there there this those three to to to trap uncle uncle wagon walls was was was was was were where which which whirlwinds who who wife with EOD; - $score = PhraseParser::computeSafeSearchScore($phrase_string); - $this->assertTrue(($score < 0.025), "Easy Safe Test 1"); + $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, + "en-US", true); + $len = strlen($phrase_string); + $score = PhraseParser::computeSafeSearchScore($word_lists, $len); + $this->assertTrue(($score < 0.012), "Easy Safe Test 1"); $phrase_string = <<< EOD a afraid all and anon baby big boobs but cock crave dicking does @@ -175,8 +178,11 @@ for from grown has how in is isnt knot lolita matts monster pussies ready she she shew slut teens their thom them thought they're tight to to to total up use whether EOD; - $score = PhraseParser::computeSafeSearchScore($phrase_string); - $this->assertTrue(($score > 0.025), "Easy Unsafe Test 1"); + $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, + "en-US", true); + $len = strlen($phrase_string); + $score = PhraseParser::computeSafeSearchScore($word_lists, $len); + $this->assertTrue(($score > 0.012), "Easy Unsafe Test 1"); $phrase_string = <<< EOD a a a a a adventure after all alotta amazing and and and and and @@ -191,8 +197,11 @@ those those those tit titties titty to to to togo today tramp truly us was we we we what what when what wild with with with workout wrap yes you EOD; - $score = PhraseParser::computeSafeSearchScore($phrase_string); - $this->assertTrue(($score > 0.025), "Harder Unsafe Test 1"); + $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, + "en-US", true); + $len = strlen($phrase_string); + $score = PhraseParser::computeSafeSearchScore($word_lists, $len); + $this->assertTrue(($score > 0.012), "Harder Unsafe Test 1"); $phrase_string = <<< EOD amino hog known a a a a an and and @@ -206,8 +215,11 @@ over parents process reproduce reproduce result sex sex sexual sexual small specialist specialized specific such that that the the the the their to to traits traits transport two types variety while young EOD; - $score = PhraseParser::computeSafeSearchScore($phrase_string); - $this->assertTrue(($score < 0.025), "Harder Safe Test 1"); + $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, + "en-US", true); + $len = strlen($phrase_string); + $score = PhraseParser::computeSafeSearchScore($word_lists, $len); + $this->assertTrue(($score < 0.012), "Harder Safe Test 1"); $phrase_string = <<< EOD a a active adverb an an and are as as as attribute be @@ -217,8 +229,11 @@ lesbian may moist verb object of of or or or others secondary refer relay romantic same sex sexual trim the the the them to to to to to used used who who wide women ward EOD; - $score = PhraseParser::computeSafeSearchScore($phrase_string); - $this->assertTrue(($score < 0.025), "Harder Safe Test 2"); + $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, + "en-US", true); + $len = strlen($phrase_string); + $score = PhraseParser::computeSafeSearchScore($word_lists, $len); + $this->assertTrue(($score < 0.012), "Harder Safe Test 2"); } }