Adds media:video and safe:true safe:false meta words, a=chris

Chris Pollett [2012-05-05 09:May:th]
Adds media:video and safe:true safe:false meta words, a=chris
Filename
bin/fetcher.php
controllers/statistics_controller.php
lib/crawl_daemon.php
lib/phrase_parser.php
lib/processors/epub_processor.php
lib/processors/sitemap_processor.php
lib/url_parser.php
models/phrase_model.php
tests/phrase_parser_test.php
diff --git a/bin/fetcher.php b/bin/fetcher.php
index a9f991cec..b87b51f16 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -1690,10 +1690,18 @@ class Fetcher implements CrawlConstants
                 if(isset($site[self::LANG])) {
                     $lang = $site[self::LANG];
                 }
-
+
                 $word_lists =
                     PhraseParser::extractPhrasesInLists($phrase_string,
                         $lang, true);
+                if(PhraseParser::computeSafeSearchScore($phrase_string) <
+                    0.025) {
+                    $meta_ids[] = "safe:true";
+                    $safe = true;
+                } else {
+                    $meta_ids[] = "safe:false";
+                    $safe = false;
+                }
             }

             $link_phrase_string = "";
@@ -1752,7 +1760,15 @@ class Fetcher implements CrawlConstants
                     $this->found_sites[self::SEEN_URLS][] = $summary;
                     $link_type = UrlParser::getDocumentType($url);
                     if(in_array($link_type, $IMAGE_TYPES)) {
+                        if(isset($safe) && !$safe) {
+                            $link_meta_ids[] = "safe:false";
+                        }
                         $link_meta_ids[] = "media:image";
+                    } else if(UrlParser::isVideoUrl($url)) {
+                        $link_meta_ids[] = "media:video";
+                        if(isset($safe) && !$safe) {
+                            $link_meta_ids[] = "safe:false";
+                        }
                     } else {
                         $link_meta_ids[] = "media:text";
                     }
@@ -1871,9 +1887,13 @@ class Fetcher implements CrawlConstants
         foreach($site[self::IP_ADDRESSES] as $address) {
             $meta_ids[] = 'ip:'.$address;
         }
-        $meta_ids[] = (stripos($site[self::TYPE], "image") !== false) ?
-            'media:image' : 'media:text';

+        if(UrlParser::isVideoUrl($site[self::URL])) {
+            $meta_ids[] = "media:video";
+        } else {
+            $meta_ids[] = (stripos($site[self::TYPE], "image") !== false) ?
+                'media:image' : 'media:text';
+        }
         // store the filetype info
         $url_type = UrlParser::getDocumentType($site[self::URL]);
         if(strlen($url_type) > 0) {
diff --git a/controllers/statistics_controller.php b/controllers/statistics_controller.php
index 7d9488e87..030ac794d 100644
--- a/controllers/statistics_controller.php
+++ b/controllers/statistics_controller.php
@@ -249,7 +249,7 @@ class StatisticsController extends Controller implements CrawlConstants
                 'tn', 'to', 'tr', 'ts', 'tt', 'tw', 'ty', 'ug', 'uk', 'ur',
                 'uz', 've', 'vi', 'vo', 'wa', 'wo', 'xh', 'yi', 'yo', 'za',
                 'zh', 'zu'),
-            "MEDIA" => array("image", "text"),
+            "MEDIA" => array("image", "text", "video"),
             "OS" => array("asianux", "centos", "clearos", "debian", "fedora",
                 "freebsd", "gentoo", "linux", "netware", "solaris", "sunos",
                 "ubuntu", "unix"),
@@ -370,7 +370,7 @@ class StatisticsController extends Controller implements CrawlConstants
     {
         $results = $this->phraseModel->getPhrasePageResults(
             "$query i:{$this->index_time_stamp}", 0,
-            1, true, NULL, true, 0, $this->machine_urls);
+            1, true, NULL, false, 0, $this->machine_urls);
         return (isset($results["TOTAL_ROWS"])) ? $results["TOTAL_ROWS"] : -1;
     }

diff --git a/lib/crawl_daemon.php b/lib/crawl_daemon.php
index 221e4035a..90f18874d 100644
--- a/lib/crawl_daemon.php
+++ b/lib/crawl_daemon.php
@@ -72,7 +72,7 @@ class CrawlDaemon implements CrawlConstants
      *  which fetcher daemon instance.
      *
      * @var string
-     * @var static
+     * @static
      */
      static $subname;

diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php
index 9db9134b8..b3c00bcf8 100755
--- a/lib/phrase_parser.php
+++ b/lib/phrase_parser.php
@@ -37,6 +37,8 @@ foreach(glob(LOCALE_DIR."/*/resources/tokenizer.php")
     as $filename) {
     require_once $filename;
 }
+$GLOBALS["CHARGRAMS"] = $CHARGRAMS;
+
 /**
  * Load the n word grams File
  */
@@ -146,10 +148,9 @@ class PhraseParser
         $lang = NULL, $orig_and_grams = false, $phrases_and_terms = true)
     {
         $phrase_lists = array();
-
         self::canonicalizePunctuatedTerms($string, $lang);
         $pre_phrases =
-            self::extractTermsAndFilterPhrases($string, $lang,$orig_and_grams);
+            self::extractTermsAndFilterPhrases($string, $lang, $orig_and_grams);
         $phrases = array();
         $j = 0;
         foreach($pre_phrases as $pre_phrase) {
@@ -236,7 +237,7 @@ class PhraseParser
         global $CHARGRAMS;

         mb_internal_encoding("UTF-8");
-        //split first on puctuation as n word grams shouldn't cross punctuation
+        //split first on punctuation as n word grams shouldn't cross punctuation
         $fragments = mb_split(PUNCT, $string);

         $final_terms = array();
@@ -379,6 +380,8 @@ class PhraseParser
      */
     static function getStemmer($lang)
     {
+        mb_regex_encoding('UTF-8');
+        mb_internal_encoding("UTF-8");
         $lower_lang = strtolower($lang); //try to avoid case sensitivity issues
         $lang_parts = explode("-", $lang);
         if(isset($lang_parts[1])) {
@@ -397,4 +400,58 @@ class PhraseParser
         }
         return $stem_obj;
     }
+
+    /**
+     *  Scores documents by how often sexually explicit terms occur in them,
+     *  ignoring letter case. Tries to work for several languages.
+     *
+     *  @param string $text passage to score
+     *  @return float|int $score of how explicit document is
+     */
+    static function computeSafeSearchScore($text)
+    {
+        static $unsafe_phrase = "
XXX sex slut nymphomaniac MILF lolita lesbian sadomasochism
bondage fisting erotic vagina Tribadism penis facial hermaphrodite
transsexual tranny bestiality snuff boob fondle tit
blowjob lap cock dick hardcore pr0n fuck pussy penetration ass
cunt bisexual prostitution screw ass masturbation clitoris clit suck whore bitch
bellaco cachar chingar shimar chinquechar chichar clavar coger culear hundir
joder mámalo singar cojon carajo caray bicho concha chucha chocha
chuchamadre coño panocha almeja culo fundillo fundío puta puto teta
connorito cul pute putain sexe pénis vulve foutre baiser sein nicher nichons
puta sapatão foder ferro punheta vadia buceta bucetinha bunda caralho
mentula cunnus verpa sōpiō pipinna cōleī cunnilingus futuō copulate cēveō crīsō
scortor meretrīx futatrix minchia coglione cornuto culo inocchio frocio puttana
vaffanculo fok hoer kut lul やりまん 打っ掛け  二形 ふたなりゴックン ゴックン
ショタコン 全裸 受け 裏本 пизда́ хуй еба́ть блядь елда́ гондо́н хер манда́ му́ди мудя
пидора́с залу́па жо́па за́дница буфер
雞巴 鷄巴 雞雞 鷄鷄 阴茎 陰莖 胯下物
屌 吊 小鳥 龟头 龜頭 屄 鸡白 雞白 傻屄 老二 那话儿 那話兒 屄 鸡白 雞白 阴道 陰道
阴户 陰戶 大姨妈 淫蟲 老嫖 妓女 臭婊子 卖豆腐 賣豆腐 咪咪 大豆腐 爆乳 肏操
炒饭 炒飯 cặc lồn kaltak orospu siktir sıçmak amcık ";
+        static $unsafe_terms = array();
+
+        if($unsafe_terms == array()) {
+            mb_regex_encoding("UTF-8"); // mb_split takes no encoding argument
+            $lower_phrase = mb_strtolower($unsafe_phrase, "UTF-8");
+            foreach(mb_split("(\s)+", $lower_phrase) as $pre_unsafe) {
+                if(strlen($pre_unsafe) > 0) {
+                    $unsafe_terms[] = $pre_unsafe;
+                }
+            }
+        }
+
+        $lower_text = mb_strtolower($text, "UTF-8"); // so "Sex" counts as "sex"
+        $num_unsafe_terms = 0; // list entries found at least once
+        $unsafe_count = 0; // occurrences summed over all list entries
+        foreach($unsafe_terms as $term) {
+            $count = mb_substr_count($lower_text, $term, "UTF-8");
+            if($count > 0) {
+                $unsafe_count += $count;
+                $num_unsafe_terms++;
+            }
+        }
+        return $num_unsafe_terms * $unsafe_count/(strlen($text) + 1);
+    }
 }
diff --git a/lib/processors/epub_processor.php b/lib/processors/epub_processor.php
index 0e8522017..e76db492f 100644
--- a/lib/processors/epub_processor.php
+++ b/lib/processors/epub_processor.php
@@ -51,18 +51,14 @@ require_once BASE_DIR."/lib/url_parser.php";

 /**
  *  The maximum length of description
- *
- *  @const integer MAX_DESCRIPTION_LEN
-*/
+ */
 const MAX_DESCRIPTION_LEN = 2000;

 /**
  * The constant represents the number of
  * child levels at which the data is present in
  * the content.opf file.
- *
- *  @const integer MAX_DOM_LEVEL
-*/
+ */
 const MAX_DOM_LEVEL = 15;

  /**
diff --git a/lib/processors/sitemap_processor.php b/lib/processors/sitemap_processor.php
index 98c4be944..21545ab13 100644
--- a/lib/processors/sitemap_processor.php
+++ b/lib/processors/sitemap_processor.php
@@ -136,7 +136,7 @@ class SitemapProcessor extends TextProcessor

         $i = 0;
         foreach($paths as $path) {
-            $nodes = $xpath->evaluate($path);
+            $nodes = @$xpath->evaluate($path);
             foreach($nodes as $node) {
                 $url = UrlParser::canonicalLink(
                     $node->textContent, $site);
diff --git a/lib/url_parser.php b/lib/url_parser.php
index 2a3085fee..053b0b875 100755
--- a/lib/url_parser.php
+++ b/lib/url_parser.php
@@ -700,6 +700,34 @@ class UrlParser
         }
         return false;
     }
+
+    /**
+     * Checks if a URL corresponds to a known playback page of a video
+     * sharing site. A URL qualifies only if it begins with one of the
+     * known playback prefixes; a prefix occurring later in the URL (for
+     * example inside a query string) does not count.
+     *
+     * @param string $url the url to check
+     * @return bool whether or not $url starts with a known playback prefix
+     */
+    static function isVideoUrl($url)
+    {
+        $video_prefixes = array("http://www.youtube.com/watch?v=",
+            "http://www.metacafe.com/watch/",
+            "http://screen.yahoo.com/",
+            "http://player.vimeo.com/video/",
+            "http://archive.org/movies/thumbnails.php?identifier=",
+            "http://www.dailymotion.com/video/",
+            "http://v.youku.com/v_playlist/",
+            "http://www.break.com/index/");
+        foreach($video_prefixes as $prefix) {
+            // byte-wise compare of the leading strlen($prefix) bytes
+            if(strncmp($url, $prefix, strlen($prefix)) === 0) {
+                return true;
+            }
+        }
+        return false;
+    }
 }

 ?>
diff --git a/models/phrase_model.php b/models/phrase_model.php
index f15d677c4..db147d777 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -105,7 +105,7 @@ class PhraseModel extends Model
             'filetype:', 'info:', '\-', 'os:', 'server:', 'date:', "numlinks:",
             'index:', 'i:', 'ip:', 'weight:', 'w:', 'u:', 'time:', 'code:',
             'lang:', 'media:', 'elink:', 'location:', 'size:', 'host:', 'dns:',
-            'path:', 'robot:');
+            'path:', 'robot:', 'safe:');

     /**
      * Number of pages to cache in one go in memcache or filecache
diff --git a/tests/phrase_parser_test.php b/tests/phrase_parser_test.php
index 61c595271..b3c1b0482 100644
--- a/tests/phrase_parser_test.php
+++ b/tests/phrase_parser_test.php
@@ -63,7 +63,8 @@ class PhraseParserTest extends UnitTest
     }

     /**
-     *
+     * Tests the ability of extractPhrasesInLists to extract some hard-case
+     * phrases and acronyms
      */
     public function extractPhrasesTestCase()
     {
@@ -97,14 +98,15 @@ EOD;
 拼音 关闭 空间 百科 hao123 | 更多>>
 About Baidu
 EOD;
+
         $word_lists = PhraseParser::extractPhrasesInLists($phrase_string,
             "zh-CN", true);
         $words = array_keys($word_lists);
         $this->assertTrue(in_array("百度", $words), "Chinese test 1");
         $this->assertTrue(in_array("mp3", $words), "Chinese test 2");
         $this->assertTrue(in_array("ab", $words), "Chinese test 3");
-        $this->assertFalse(in_array("", $words), "Chinese test 3");
-        $this->assertFalse(in_array("下,", $words), "Chinese test 4");
+        $this->assertFalse(in_array("", $words), "Chinese test 4");
+        $this->assertFalse(in_array("下,", $words), "Chinese test 5");

         $phrase_string = <<< EOD
 P.O. Box 765,  http://somewhere.edu.au
@@ -138,6 +140,85 @@ EOD;
         $this->assertTrue(in_array(
             "http_c__s__s_yo_d_lo_d_edu_s_faculty_pages_s_zebra_s_",
             $words), "URL Check 2");
+    }
+
+    /**
+     * Checks whether a single safe search threshold (0.025) separates porn
+     * from non-porn text. Samples were taken from a couple of porn sites,
+     * sorted alphabetically by word and then some of the non-sensitive words
+     * were substituted so as to avoid copyright issues. For the safe tests
+     * a similar process was done with the Wizard of Oz (now public domain)
+     * and with some sexually related Wikipedia articles (Creative Commons SA).
+     */
+    public function computeSafeSearchScoreTestCase()
+    {
+        $phrase_string = <<< EOD
+a a a a a a a a a a a a all and and
and and and and and another any arose at aunt aunt be bed bed beds big
build building by by called carried case cellar cellar chairs contained
cookstove corner corner could crush cupboard cyclone dark dishes door
dorothy dorothy down dug em em enough except family farmer farmer's floor
floor for for four four from garret go great great ground had had henry
henry hole hole house in in in in in in in into it it its kansas ladder led
little lived looking lumber made many middle midst mighty miles no no of of
of one one one or path prairies reached roof room room rusty small small
small table the the the the the the the the the the the their there there
this those three to to to trap uncle uncle wagon walls was was was was was
were where which which whirlwinds who who wife with
+EOD;
+        $score = PhraseParser::computeSafeSearchScore($phrase_string);
+        $this->assertTrue(($score < 0.025), "Easy Safe Test 1");
+
+        $phrase_string = <<< EOD
+a afraid all and anon baby big boobs but cock crave dicking does
for from grown has how in is isnt knot lolita matts monster pussies ready
she she shew slut teens their thom them thought they're tight to to to total
up use whether
+EOD;
+        $score = PhraseParser::computeSafeSearchScore($phrase_string);
+        $this->assertTrue(($score > 0.025), "Easy Unsafe Test 1");
+
+        $phrase_string = <<< EOD
+a a a a a adventure after all alotta amazing and and and and and
and and and and and around as ball ball big body boobies bounce boy
brunhilda came check check chilled cirque do enjoy ensued exercises
flap friends fucking fucking give going gorge got got grabbing had
had had has he hell her her horny i if in it it it it it jog junk
just know little little loved me mean melons melons my my of on out out
ploy precious kitties see she she she sought sizzle so so spent spicy
started stretch sucking swinging that that that the the the then things
those those those tit titties titty to to to togo today tramp truly
us was we we we what what when what wild with with with workout wrap yes
you
+EOD;
+        $score = PhraseParser::computeSafeSearchScore($phrase_string);
+        $this->assertTrue(($score > 0.025), "Harder Unsafe Test 1");
+
+        $phrase_string = <<< EOD
+amino hog known a a a a an and and
and and are are as as asymmetry be biology both but can cases cells
combining combining contain deem distance each early evolved exist
female female for firm firm from function gametes gametes gametes gametes
genetic genentech has ideal in in in in information disinherit into intone
is isopod known large mole mole many mixing motile motile necessary
non nutrients of of of of offspring often optimized or organism organisms
over parents process reproduce reproduce result sex sex sexual
sexual small specialist specialized specific such that that the the the
the their to to traits traits transport two types variety while young
+EOD;
+        $score = PhraseParser::computeSafeSearchScore($phrase_string);
+        $this->assertTrue(($score < 0.025), "Harder Safe Test 1");
+
+        $phrase_string = <<< EOD
+a a active adverb an an and are as as as attribute be
between by caught characterized daft describe describe desire desire deft
french female female females having homosexuality identify in is language
lesbian may moist verb object of of or or or others secondary refer relay
romantic same sex sexual trim the the the them to to to to to used
used who who wide women ward
+EOD;
+        $score = PhraseParser::computeSafeSearchScore($phrase_string);
+        $this->assertTrue(($score < 0.025), "Harder Safe Test 2");
+
     }
 }
ViewGit