Exit Indexer if DictionaryUpdater isn't alive when it should be, a=chris
Exit the Indexer (and ArcTool) when the DictionaryUpdater seems to have crashed, i.e., is not alive when it should be, a=chris
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 22b2c58a2..7fd1122e4 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -1202,6 +1202,13 @@ EOD;
$fp = fopen($dictionary_log, "w");
fclose($fp);
}
+ clearstatcache();
+ if (time() > filemtime($next_partition_path) +
+ 5 * C\LOG_TIMEOUT) {
+ L\crawlLog("DictionaryUpdater seems to have crashed, ".
+ "exiting ArcTool");
+ exit();
+ }
sleep(15);
}
$next_partition =
diff --git a/src/executables/DictionaryUpdater.php b/src/executables/DictionaryUpdater.php
index 6f2c60f8b..03beda29c 100644
--- a/src/executables/DictionaryUpdater.php
+++ b/src/executables/DictionaryUpdater.php
@@ -114,7 +114,7 @@ class DictionaryUpdater implements CrawlConstants
$index_archive->next_partition_to_add);
}
//note the false parameter means will only update for one partition
- $index_archive->updateDictionary(null, false);
+ $index_archive->updateDictionary($next_partition_path, false);
}
/**
* Given a folder name, determines the kind of bundle (if any) it holds.
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 47d280abf..e18f4b0fe 100644
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1769,6 +1769,7 @@ class QueueServer implements CrawlConstants
/* first condition below is to ensure not to call if still processing
previous partition
*/
+ clearstatcache(); // to make sure filemtime below not cached
if ($this->last_next_partition_to_add < $next_partition_to_add &&
$next_partition_to_add < $save_partition) {
$this->last_next_partition_to_add = $next_partition_to_add;
@@ -1777,6 +1778,12 @@ class QueueServer implements CrawlConstants
$options);
CrawlDaemon::execScriptInOwnProcess(C\BASE_DIR .
"/executables/DictionaryUpdater.php", $options);
+ } else if ($this->last_next_partition_to_add ==
+ $next_partition_to_add && $next_partition_to_add < $save_partition
+ && time() > filemtime($next_partition_file) + 5 * C\LOG_TIMEOUT) {
+ L\crawlLog("DictionaryUpdater seems to have crashed, exiting ".
+ "Indexer");
+ exit();
}
L\garbageCollect();
}
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index b9bb938e5..93a53ba9e 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -86,7 +86,7 @@ class PhraseParser
/**
* Threshold to use for a string to be conisdered "safe" (not X-rated)
*/
- const SAFE_PHRASE_THRESHOLD = 0.03;
+ const SAFE_PHRASE_THRESHOLD = 0.035;
/**
* Converts a summary of a web page into a string of space separated words
*
@@ -1464,8 +1464,8 @@ class PhraseParser
"lesbian|sadomasochism|bondage|fisting|erotic|vagina|Tribadism|" .
"penis|facial|hermaphrodite|transsexual|tranny|bestiality|snuff|" .
"boob|fondle|tit|blowjob|lap|cock|dick|hardcore|pr0n|fuck|pussy|" .
- "penetration|ass|cunt|bisexual|prostitution|screw|ass|melon|" .
- "masturbation|clitoris|clit|suck|whore|bitch|cuckold|porn|" .
+ "penetration|ass|cunt|bisexual|prostitution|screw|ass|swinging|" .
+ "masturbation|clitoris|clit|suck|whore|bitch|cuckold|porn|melon|" .
"femdom|exhibitionism|bellaco|cachar|chingar|shimar|chinquechar|" .
"chichar|clavar|coger|culear|hundir|joder|mámalo|singar|cojon|" .
"carajo|caray|bicho|concha|chucha|chocha|chuchamadre|coño|" .
@@ -1482,6 +1482,8 @@ class PhraseParser
"那话儿|那話兒|屄|鸡白|雞白|阴道|陰道|阴户|陰戶|大姨妈|淫蟲|老嫖|妓女|" .
"臭婊子|卖豆腐|賣豆腐|咪咪|大豆腐|爆乳|肏操|炒饭|炒飯|cặc|lồn|kaltak|" .
"orospu|siktir|sıçmak|amcık";
+ static $boundary_regex = "";
+ static $no_boundary_regex = "";
/* took keywords from top level domains from some of theporndude list
*/
static $unsafe_url_regex = "/porn|xvideos|livejasmin|".
@@ -1513,6 +1515,10 @@ class PhraseParser
"adam4adam|cams\.com|mrskin|adultwork|oglaf|streamate|".
"nifty\.org|adultdvd|suicidegirls|ftvgirls|asstr|private\.com|".
"squirt\.org|fakku|faapy|fux|txxx|\Wnude\W/i";
+ if (empty($boundary_regex)) {
+ $boundary_regex = "/\b$pre_unsafe_regex\b/ui";
+ $no_boundary_regex = "/$pre_unsafe_regex/ui";
+ }
if (!empty($url) && preg_match($unsafe_url_regex, $url)) {
return 1;
}
@@ -1533,9 +1539,9 @@ class PhraseParser
*/
if ($term_boundaries < ceil($len/8)) { //maybe text
$term_boundaries = ceil($len/3);
- $unsafe_regex = "/$pre_unsafe_regex/ui";
+ $unsafe_regex = $no_boundary_regex;
} else {
- $unsafe_regex = "/\b$pre_unsafe_regex\b/ui";
+ $unsafe_regex = $boundary_regex;
}
$match_count = preg_match_all($unsafe_regex, $phrase);
$score = $match_count/$term_boundaries;