Exit Indexer if dictionary updater isn't alive when should be, a=chris

Chris Pollett [2022-08-10]
Exit Indexer if dictionary updater isn't alive when should be, a=chris
Filename
src/executables/ArcTool.php
src/executables/DictionaryUpdater.php
src/executables/QueueServer.php
src/library/PhraseParser.php
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 22b2c58a2..7fd1122e4 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -1202,6 +1202,13 @@ EOD;
                     $fp = fopen($dictionary_log, "w");
                     fclose($fp);
                 }
+                clearstatcache();
+                if (time() > filemtime($next_partition_path) +
+                    5 * C\LOG_TIMEOUT) {
+                    L\crawlLog("DictionaryUpdater seems to have crashed, ".
+                        "exiting ArcTool");
+                    exit();
+                }
                 sleep(15);
             }
             $next_partition =
diff --git a/src/executables/DictionaryUpdater.php b/src/executables/DictionaryUpdater.php
index 6f2c60f8b..03beda29c 100644
--- a/src/executables/DictionaryUpdater.php
+++ b/src/executables/DictionaryUpdater.php
@@ -114,7 +114,7 @@ class DictionaryUpdater implements CrawlConstants
                 $index_archive->next_partition_to_add);
         }
         //note the false parameter means will only update for one partition
-        $index_archive->updateDictionary(null, false);
+        $index_archive->updateDictionary($next_partition_path, false);
     }
     /**
      * Given a folder name, determines the kind of bundle (if any) it holds.
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 47d280abf..e18f4b0fe 100644
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1769,6 +1769,7 @@ class QueueServer implements CrawlConstants
         /* first condition below is to ensure not to call if still processing
            previous partition
          */
+        clearstatcache(); // to make sure filemtime below not cached
         if ($this->last_next_partition_to_add < $next_partition_to_add &&
             $next_partition_to_add < $save_partition) {
             $this->last_next_partition_to_add = $next_partition_to_add;
@@ -1777,6 +1778,12 @@ class QueueServer implements CrawlConstants
                 $options);
             CrawlDaemon::execScriptInOwnProcess(C\BASE_DIR .
                 "/executables/DictionaryUpdater.php", $options);
+        } else if ($this->last_next_partition_to_add ==
+            $next_partition_to_add && $next_partition_to_add < $save_partition
+            && time() > filemtime($next_partition_file) + 5 * C\LOG_TIMEOUT) {
+            L\crawlLog("DictionaryUpdater seems to have crashed, exiting ".
+                "Indexer");
+            exit();
         }
         L\garbageCollect();
     }
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index b9bb938e5..93a53ba9e 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -86,7 +86,7 @@ class PhraseParser
     /**
      * Threshold to use for a string to be considered "safe" (not X-rated)
      */
-    const SAFE_PHRASE_THRESHOLD = 0.03;
+    const SAFE_PHRASE_THRESHOLD = 0.035;
     /**
      * Converts a summary of a web page into a string of space separated words
      *
@@ -1464,8 +1464,8 @@ class PhraseParser
             "lesbian|sadomasochism|bondage|fisting|erotic|vagina|Tribadism|" .
             "penis|facial|hermaphrodite|transsexual|tranny|bestiality|snuff|" .
             "boob|fondle|tit|blowjob|lap|cock|dick|hardcore|pr0n|fuck|pussy|" .
-            "penetration|ass|cunt|bisexual|prostitution|screw|ass|melon|" .
-            "masturbation|clitoris|clit|suck|whore|bitch|cuckold|porn|" .
+            "penetration|ass|cunt|bisexual|prostitution|screw|ass|swinging|" .
+            "masturbation|clitoris|clit|suck|whore|bitch|cuckold|porn|melon|" .
             "femdom|exhibitionism|bellaco|cachar|chingar|shimar|chinquechar|" .
             "chichar|clavar|coger|culear|hundir|joder|mámalo|singar|cojon|" .
             "carajo|caray|bicho|concha|chucha|chocha|chuchamadre|coño|" .
@@ -1482,6 +1482,8 @@ class PhraseParser
             "那话儿|那話兒|屄|鸡白|雞白|阴道|陰道|阴户|陰戶|大姨妈|淫蟲|老嫖|妓女|" .
             "臭婊子|卖豆腐|賣豆腐|咪咪|大豆腐|爆乳|肏操|炒饭|炒飯|cặc|lồn|kaltak|" .
             "orospu|siktir|sıçmak|amcık";
+        static $boundary_regex = "";
+        static $no_boundary_regex = "";
         /* took keywords from top level domains from some of theporndude list
          */
         static $unsafe_url_regex = "/porn|xvideos|livejasmin|".
@@ -1513,6 +1515,10 @@ class PhraseParser
             "adam4adam|cams\.com|mrskin|adultwork|oglaf|streamate|".
             "nifty\.org|adultdvd|suicidegirls|ftvgirls|asstr|private\.com|".
             "squirt\.org|fakku|faapy|fux|txxx|\Wnude\W/i";
+        if (empty($boundary_regex)) {
+            $boundary_regex = "/\b$pre_unsafe_regex\b/ui";
+            $no_boundary_regex = "/$pre_unsafe_regex/ui";
+        }
         if (!empty($url) && preg_match($unsafe_url_regex, $url)) {
             return 1;
         }
@@ -1533,9 +1539,9 @@ class PhraseParser
          */
         if ($term_boundaries < ceil($len/8)) { //maybe text
             $term_boundaries = ceil($len/3);
-            $unsafe_regex = "/$pre_unsafe_regex/ui";
+            $unsafe_regex = $no_boundary_regex;
         } else {
-            $unsafe_regex = "/\b$pre_unsafe_regex\b/ui";
+            $unsafe_regex = $boundary_regex;
         }
         $match_count = preg_match_all($unsafe_regex, $phrase);
         $score = $match_count/$term_boundaries;
ViewGit