fix bug in token tool kwiki-seeds not matching multiple word seeds, a=chris

Chris Pollett [2022-07-11 07:Jul:th]
fix bug in token tool kwiki-seeds not matching multiple word seeds, a=chris
Filename
src/configs/TokenTool.php
src/library/CrawlQueueBundle.php
src/library/Utility.php
src/library/media_jobs/TrendingHighlightsJob.php
src/library/processors/HtmlProcessor.php
diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php
index 31102f2c6..b3e8673f7 100644
--- a/src/configs/TokenTool.php
+++ b/src/configs/TokenTool.php
@@ -744,7 +744,7 @@ function getTopPages($page_count_file, $locale_tag, $max_pages,
     list($fr, $read, $close) = smartOpen($page_count_file);
     $locale_tag = explode("-", $locale_tag)[0];
     $pattern = '/^' . $locale_tag .
-        "\.wikipedia\s([^\s\,\$\+\d\=\:\/\_\!\"\%]+)\s\d+\s[a-z]+\s(\d+)/u";
+        "\.wikipedia\s([^\s\,\$\+\d\=\:\/\!\"\%]+)\s\d+\s[^\d]+\s(\d+)/u";
     $bytes = 0;
     $bytes_since_last_output = 0;
     $input_buffer = "";
@@ -781,7 +781,8 @@ function getTopPages($page_count_file, $locale_tag, $max_pages,
             if (substr_count($title, "_") > 2 || strlen($title) > C\TITLE_LEN) {
                 continue;
             }
-            $title_counts[$title] = intval($page_counts);
+            $title_counts[$title] ??= 0; // so can add mobile and desktop counts
+            $title_counts[$title] += intval($page_counts);
             $buffer_size = max(4 * $max_pages, 40000);
             if (count($title_counts) > $buffer_size) {
                 echo  "..pruning results to $max_pages terms.\n";
diff --git a/src/library/CrawlQueueBundle.php b/src/library/CrawlQueueBundle.php
index fa23959af..96f87acc7 100644
--- a/src/library/CrawlQueueBundle.php
+++ b/src/library/CrawlQueueBundle.php
@@ -62,11 +62,18 @@ class CrawlQueueBundle
      */
     public $filter_size;
     /**
-     *
+     * Array of hosts for which a robots.txt file has just been received and
+     * processed, and for which urls from that host are still waiting to be
+     * notified for queueing.
+     * @var array
      */
     public $robot_notify_hosts;
     /**
-     *
+     * LinearHashTable of information about company level domains that have
+     * been crawled. Information includes number of SEEN_URLS, number of
+     * WEIGHTED_SEEN_URLS, number of WEIGHTED_INCOMING_URLS.
+     * (A company level domain is google.com or google.co.uk, but not
+     *  fo.la.google.com, www.google.com, foo.google.com or foo.google.co.uk)
      * @var LinearHashTable
      */
     public $domain_table;
@@ -125,9 +132,12 @@ class CrawlQueueBundle
      */
     const MAX_URL_FILE_SIZE = 1000000;
     /**
+     * When writing urls to robot_table, how many to buffer at a time and
+     * then bulk put.
      */
     const MAX_URL_BUFFER_BEFORE_WRITE = 500;
     /**
+     * File extension to use for files of serialized url data
      */
     const URL_FILES_EXTENSION = ".txt.gz";
     /**
diff --git a/src/library/Utility.php b/src/library/Utility.php
index 2f5512a70..3ea2568e6 100755
--- a/src/library/Utility.php
+++ b/src/library/Utility.php
@@ -2371,7 +2371,7 @@ function isPositiveInteger($input)
         preg_match("/^\d+$/", trim($input)) && intval($input) > 0);
 }
 /**
- * Used to measure the memory foot print and time spent calling a metho
+ * Used to measure the memory footprint in bytes and time spent calling a method
  * of an object. It also records number of time the method has been called.
  * Just calls the method without any recording or timing until
  * an initial call to the function measureCall(null, save_statistics_file)
@@ -2390,14 +2390,39 @@ function measureCall($object, $method, $arguments = [], $call_name = "")
     return measureObjectCall($object, $method, $arguments, $call_name);
 }
 /**
- *
+ * Used to measure the memory footprint of an object in Yioop and save
+ * it to a statistics file
+ * No recording is done until
+ * an initial call to the function measureCall(null, save_statistics_file)
+ * where save_statistics_file is the name of the file you want to store
+ * statistics to.
+ * @param object $object name of object whose size we want to measure
+ * @param string $save_file statistics file to write info to
+ * @param string $class_name name of class to use for measuring. If empty
+ *  will call getClass to determine this
+ * @return bool true if legitimate arguments supplied
  */
-function measureObject($object, $save_file = "", $call_name = "")
+function measureObject($object, $save_file = "", $class_name = "")
 {
-    return measureObjectCall($object, $save_file, [], $call_name);
+    return measureObjectCall($object, $save_file, [], $class_name);
 }
 /**
+ * General method called by @see measureCall and @see measureObject
+ * Used to measure the memory footprint in bytes of an object or memory and
+ * time spent calling a method of an object. It also records number of times
+ * the method has been called. When used to call a method before initialization,
+ * just calls the method without any recording or timing. To initialize,
+ * an initial call to the function measureCall(null, save_statistics_file)
+ * where save_statistics_file is the name of the file you want to store
+ * statistics to should be done.
  *
+ * @param object $object name of object whose method we want to call and measure
+ * @param string $method method we're calling
+ * @param array $arguments arguments to be sent to the method
+ * @param string $call_name name to use when outputting stats for this call,
+ *  defaults to $method.
+ * @return mixed whatever method would normally return when called as above
+
  */
 function measureObjectCall($object, $method, $arguments = [], $call_name = "")
 {
diff --git a/src/library/media_jobs/TrendingHighlightsJob.php b/src/library/media_jobs/TrendingHighlightsJob.php
index adf9d42f6..bb7f53448 100644
--- a/src/library/media_jobs/TrendingHighlightsJob.php
+++ b/src/library/media_jobs/TrendingHighlightsJob.php
@@ -110,7 +110,7 @@ class TrendingHighlightsJob extends MediaJob
         $this->prepareTasks();
     }
     /**
-     *
+     * (Run on name server)
      */
     public function prepareTasks()
     {
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index 6014edbfa..e531fa441 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -217,7 +217,15 @@ class HtmlProcessor extends TextProcessor
         return $summary;
     }
     /**
+     * Used to compute the favicon url for a web page.
      *
+     * @param object $dom document object model of the web page trying to
+     *   compute the favicon url for
+     * @param string $url of web page that $dom corresponds to. Used to
+     *   help compute favicon url if link to icon relative in $dom or
+     *   if non-present and guessing using hostname.
+     * @return string url of favicon for web page (empty string if couldn't
+     *   determine)
      */
     public static function favicon($dom, $url)
     {
ViewGit