diff --git a/bin/fetcher.php b/bin/fetcher.php index a9f991cec..b87b51f16 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -1690,10 +1690,18 @@ class Fetcher implements CrawlConstants if(isset($site[self::LANG])) { $lang = $site[self::LANG]; } - + $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang, true); + if(PhraseParser::computeSafeSearchScore($phrase_string) < + 0.025) { + $meta_ids[] = "safe:true"; + $safe = true; + } else { + $meta_ids[] = "safe:false"; + $safe = false; + } } $link_phrase_string = ""; @@ -1752,7 +1760,15 @@ class Fetcher implements CrawlConstants $this->found_sites[self::SEEN_URLS][] = $summary; $link_type = UrlParser::getDocumentType($url); if(in_array($link_type, $IMAGE_TYPES)) { + if(isset($safe) && !$safe) { + $link_meta_ids[] = "safe:false"; + } $link_meta_ids[] = "media:image"; + } else if(UrlParser::isVideoUrl($url)) { + $link_meta_ids[] = "media:video"; + if(isset($safe) && !$safe) { + $link_meta_ids[] = "safe:false"; + } } else { $link_meta_ids[] = "media:text"; } @@ -1871,9 +1887,13 @@ class Fetcher implements CrawlConstants foreach($site[self::IP_ADDRESSES] as $address) { $meta_ids[] = 'ip:'.$address; } - $meta_ids[] = (stripos($site[self::TYPE], "image") !== false) ? - 'media:image' : 'media:text'; + if(UrlParser::isVideoUrl($site[self::URL])) { + $meta_ids[] = "media:video"; + } else { + $meta_ids[] = (stripos($site[self::TYPE], "image") !== false) ? 
+ 'media:image' : 'media:text'; + } // store the filetype info $url_type = UrlParser::getDocumentType($site[self::URL]); if(strlen($url_type) > 0) { diff --git a/controllers/statistics_controller.php b/controllers/statistics_controller.php index 7d9488e87..030ac794d 100644 --- a/controllers/statistics_controller.php +++ b/controllers/statistics_controller.php @@ -249,7 +249,7 @@ class StatisticsController extends Controller implements CrawlConstants 'tn', 'to', 'tr', 'ts', 'tt', 'tw', 'ty', 'ug', 'uk', 'ur', 'uz', 've', 'vi', 'vo', 'wa', 'wo', 'xh', 'yi', 'yo', 'za', 'zh', 'zu'), - "MEDIA" => array("image", "text"), + "MEDIA" => array("image", "text", "video"), "OS" => array("asianux", "centos", "clearos", "debian", "fedora", "freebsd", "gentoo", "linux", "netware", "solaris", "sunos", "ubuntu", "unix"), @@ -370,7 +370,7 @@ class StatisticsController extends Controller implements CrawlConstants { $results = $this->phraseModel->getPhrasePageResults( "$query i:{$this->index_time_stamp}", 0, - 1, true, NULL, true, 0, $this->machine_urls); + 1, true, NULL, false, 0, $this->machine_urls); return (isset($results["TOTAL_ROWS"])) ? $results["TOTAL_ROWS"] : -1; } diff --git a/lib/crawl_daemon.php b/lib/crawl_daemon.php index 221e4035a..90f18874d 100644 --- a/lib/crawl_daemon.php +++ b/lib/crawl_daemon.php @@ -72,7 +72,7 @@ class CrawlDaemon implements CrawlConstants * which fetcher daemon instance. 
* * @var string - * @var static + * @static */ static $subname; diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php index 9db9134b8..b3c00bcf8 100755 --- a/lib/phrase_parser.php +++ b/lib/phrase_parser.php @@ -37,6 +37,8 @@ foreach(glob(LOCALE_DIR."/*/resources/tokenizer.php") as $filename) { require_once $filename; } +$GLOBALS["CHARGRAMS"] = $CHARGRAMS; + /** * Load the n word grams File */ @@ -146,10 +148,9 @@ class PhraseParser $lang = NULL, $orig_and_grams = false, $phrases_and_terms = true) { $phrase_lists = array(); - self::canonicalizePunctuatedTerms($string, $lang); $pre_phrases = - self::extractTermsAndFilterPhrases($string, $lang,$orig_and_grams); + self::extractTermsAndFilterPhrases($string, $lang, $orig_and_grams); $phrases = array(); $j = 0; foreach($pre_phrases as $pre_phrase) { @@ -236,7 +237,7 @@ class PhraseParser global $CHARGRAMS; mb_internal_encoding("UTF-8"); - //split first on puctuation as n word grams shouldn't cross punctuation + //split first on punctuation as n word grams shouldn't cross punctuation $fragments = mb_split(PUNCT, $string); $final_terms = array(); @@ -379,6 +380,8 @@ class PhraseParser */ static function getStemmer($lang) { + mb_regex_encoding('UTF-8'); + mb_internal_encoding("UTF-8"); $lower_lang = strtolower($lang); //try to avoid case sensitivity issues $lang_parts = explode("-", $lang); if(isset($lang_parts[1])) { @@ -397,4 +400,58 @@ class PhraseParser } return $stem_obj; } + + /** + * Scores documents according to the presence or absence of sexually explicit + * terms. Tries to work for several languages. 
+ * + * @param string $text passage to score + * @return int $score of how explicit document is + */ + static function computeSafeSearchScore($text) + { + static $unsafe_phrase = " +XXX sex slut nymphomaniac MILF lolita lesbian sadomasochism +bondage fisting erotic vagina Tribadism penis facial hermaphrodite +transsexual tranny bestiality snuff boob fondle tit +blowjob lap cock dick hardcore pr0n fuck pussy penetration ass +cunt bisexual prostitution screw ass masturbation clitoris clit suck whore bitch +bellaco cachar chingar shimar chinquechar chichar clavar coger culear hundir +joder mámalo singar cojon carajo caray bicho concha chucha chocha +chuchamadre coño panocha almeja culo fundillo fundío puta puto teta +connorito cul pute putain sexe pénis vulve foutre baiser sein nicher nichons +puta sapatão foder ferro punheta vadia buceta bucetinha bunda caralho +mentula cunnus verpa sōpiō pipinna cōleī cunnilingus futuō copulate cēveō crīsō +scortor meretrīx futatrix minchia coglione cornuto culo inocchio frocio puttana +vaffanculo fok hoer kut lul やりまん 打っ掛け 二形 ふたなりゴックン ゴックン +ショタコン 全裸 受け 裏本 пизда́ хуй еба́ть блядь елда́ гондо́н хер манда́ му́ди мудя +пидора́с залу́па жо́па за́дница буфер +雞巴 鷄巴 雞雞 鷄鷄 阴茎 陰莖 胯下物 +屌 吊 小鳥 龟头 龜頭 屄 鸡白 雞白 傻屄 老二 那话儿 那話兒 屄 鸡白 雞白 阴道 陰道 +阴户 陰戶 大姨妈 淫蟲 老嫖 妓女 臭婊子 卖豆腐 賣豆腐 咪咪 大豆腐 爆乳 肏操 +炒饭 炒飯 cặc lồn kaltak orospu siktir sıçmak amcık "; + static $unsafe_terms = array(); + + if($unsafe_terms == array()) { + $pre_unsafe_terms = mb_split("(\s)+", $unsafe_phrase); + foreach($pre_unsafe_terms as $pre_unsafe) { + if(strlen($pre_unsafe) > 0) { + $unsafe_terms[] = $pre_unsafe; + } + } + } + + $num_unsafe_terms = 0; + $unsafe_count = 0; + foreach($unsafe_terms as $term) { + $count = mb_substr_count($text, $term); + if($count > 0 ) { + $unsafe_count += $count; + $num_unsafe_terms++; + } + } + + $score = $num_unsafe_terms * $unsafe_count/(strlen($text) + 1); + return $score; + } } diff --git a/lib/processors/epub_processor.php 
b/lib/processors/epub_processor.php index 0e8522017..e76db492f 100644 --- a/lib/processors/epub_processor.php +++ b/lib/processors/epub_processor.php @@ -51,18 +51,14 @@ require_once BASE_DIR."/lib/url_parser.php"; /** * The maximum length of description - * - * @const integer MAX_DESCRIPTION_LEN -*/ + */ const MAX_DESCRIPTION_LEN = 2000; /** * The constant represents the number of * child levels at which the data is present in * the content.opf file. - * - * @const integer MAX_DOM_LEVEL -*/ + */ const MAX_DOM_LEVEL = 15; /** diff --git a/lib/processors/sitemap_processor.php b/lib/processors/sitemap_processor.php index 98c4be944..21545ab13 100644 --- a/lib/processors/sitemap_processor.php +++ b/lib/processors/sitemap_processor.php @@ -136,7 +136,7 @@ class SitemapProcessor extends TextProcessor $i = 0; foreach($paths as $path) { - $nodes = $xpath->evaluate($path); + $nodes = @$xpath->evaluate($path); foreach($nodes as $node) { $url = UrlParser::canonicalLink( $node->textContent, $site); diff --git a/lib/url_parser.php b/lib/url_parser.php index 2a3085fee..053b0b875 100755 --- a/lib/url_parser.php +++ b/lib/url_parser.php @@ -700,6 +700,34 @@ class UrlParser } return false; } + + /** + * Checks if a URL corresponds to a known playback page of a video + * sharing site + * + * @param string $url the url to check + * @return bool whether or not corresponds to video playback page of a known + * video site + */ + static function isVideoUrl($url) + { + $video_prefixes = array("http://www.youtube.com/watch?v=", + "http://www.metacafe.com/watch/", + "http://screen.yahoo.com/", + "http://player.vimeo.com/video/", + "http://archive.org/movies/thumbnails.php?identifier=", + "http://www.dailymotion.com/video/", + "http://v.youku.com/v_playlist/", + "http://www.break.com/index/"); + foreach($video_prefixes as $prefix) { + $quoted = preg_quote($prefix, "/"); + $pattern = "/$quoted/"; + if(preg_match($pattern, $url) > 0) { + return true; + } + } + return false; + } } ?> diff --git 
a/models/phrase_model.php b/models/phrase_model.php index f15d677c4..db147d777 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -105,7 +105,7 @@ class PhraseModel extends Model 'filetype:', 'info:', '\-', 'os:', 'server:', 'date:', "numlinks:", 'index:', 'i:', 'ip:', 'weight:', 'w:', 'u:', 'time:', 'code:', 'lang:', 'media:', 'elink:', 'location:', 'size:', 'host:', 'dns:', - 'path:', 'robot:'); + 'path:', 'robot:', 'safe:'); /** * Number of pages to cache in one go in memcache or filecache diff --git a/tests/phrase_parser_test.php b/tests/phrase_parser_test.php index 61c595271..b3c1b0482 100644 --- a/tests/phrase_parser_test.php +++ b/tests/phrase_parser_test.php @@ -63,7 +63,8 @@ class PhraseParserTest extends UnitTest } /** - * + * Tests the ability of extractPhrasesInLists to extract some hard-case + * phrases and acronyms */ public function extractPhrasesTestCase() { @@ -97,14 +98,15 @@ EOD; 拼音 关闭 空间 百科 hao123 | 更多>> About Baidu EOD; + $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, "zh-CN", true); $words = array_keys($word_lists); $this->assertTrue(in_array("百度", $words), "Chinese test 1"); $this->assertTrue(in_array("mp3", $words), "Chinese test 2"); $this->assertTrue(in_array("ab", $words), "Chinese test 3"); - $this->assertFalse(in_array("", $words), "Chinese test 3"); - $this->assertFalse(in_array("下,", $words), "Chinese test 4"); + $this->assertFalse(in_array("", $words), "Chinese test 4"); + $this->assertFalse(in_array("下,", $words), "Chinese test 5"); $phrase_string = <<< EOD P.O. Box 765, http://somewhere.edu.au @@ -138,6 +140,85 @@ EOD; $this->assertTrue(in_array( "http_c__s__s_yo_d_lo_d_edu_s_faculty_pages_s_zebra_s_", $words), "URL Check 2"); + } + + /** + * Checks whether the safe search threshold can distinguish porn from + * non-porn sites. Samples were taken from a couple of porn sites, + * sorted alphabetically by word and then some of the non-sensitive words + * were substituted so as to avoid copyright issues. 
For the safe tests + * a similar process was done with the Wizard of Oz (now public domain) + * and with some sexually related Wikipedia articles (Creative Commons SA). + */ + public function computeSafeSearchScoreTestCase() + { + $phrase_string = <<< EOD +a a a a a a a a a a a a all and and +and and and and and another any arose at aunt aunt be bed bed beds big +build building by by called carried case cellar cellar chairs contained +cookstove corner corner could crush cupboard cyclone dark dishes door +dorothy dorothy down dug em em enough except family farmer farmer's floor +floor for for four four from garret go great great ground had had henry +henry hole hole house in in in in in in in into it it its kansas ladder led +little lived looking lumber made many middle midst mighty miles no no of of +of one one one or path prairies reached roof room room rusty small small +small table the the the the the the the the the the the their there there +this those three to to to trap uncle uncle wagon walls was was was was was +were where which which whirlwinds who who wife with +EOD; + $score = PhraseParser::computeSafeSearchScore($phrase_string); + $this->assertTrue(($score < 0.025), "Easy Safe Test 1"); + + $phrase_string = <<< EOD +a afraid all and anon baby big boobs but cock crave dicking does +for from grown has how in is isnt knot lolita matts monster pussies ready +she she shew slut teens their thom them thought they're tight to to to total +up use whether +EOD; + $score = PhraseParser::computeSafeSearchScore($phrase_string); + $this->assertTrue(($score > 0.025), "Easy Unsafe Test 1"); + + $phrase_string = <<< EOD +a a a a a adventure after all alotta amazing and and and and and +and and and and and around as ball ball big body boobies bounce boy +brunhilda came check check chilled cirque do enjoy ensued exercises +flap friends fucking fucking give going gorge got got grabbing had +had had has he hell her her horny i if in it it it it it jog junk +just know 
little little loved me mean melons melons my my of on out out +ploy precious kitties see she she she sought sizzle so so spent spicy +started stretch sucking swinging that that that the the the then things +those those those tit titties titty to to to togo today tramp truly +us was we we we what what when what wild with with with workout wrap yes +you +EOD; + $score = PhraseParser::computeSafeSearchScore($phrase_string); + $this->assertTrue(($score > 0.025), "Harder Unsafe Test 1"); + + $phrase_string = <<< EOD +amino hog known a a a a an and and +and and are are as as asymmetry be biology both but can cases cells +combining combining contain deem distance each early evolved exist +female female for firm firm from function gametes gametes gametes gametes +genetic genentech has ideal in in in in information disinherit into intone +is isopod known large mole mole many mixing motile motile necessary +non nutrients of of of of offspring often optimized or organism organisms +over parents process reproduce reproduce result sex sex sexual +sexual small specialist specialized specific such that that the the the +the their to to traits traits transport two types variety while young +EOD; + $score = PhraseParser::computeSafeSearchScore($phrase_string); + $this->assertTrue(($score < 0.025), "Harder Safe Test 1"); + + $phrase_string = <<< EOD +a a active adverb an an and are as as as attribute be +between by caught characterized daft describe describe desire desire deft +french female female females having homosexuality identify in is language +lesbian may moist verb object of of or or or others secondary refer relay +romantic same sex sexual trim the the the them to to to to to used +used who who wide women ward +EOD; + $score = PhraseParser::computeSafeSearchScore($phrase_string); + $this->assertTrue(($score < 0.025), "Harder Safe Test 2"); } }