diff --git a/src/configs/TokenTool.php b/src/configs/TokenTool.php
index 31102f2c6..b3e8673f7 100644
--- a/src/configs/TokenTool.php
+++ b/src/configs/TokenTool.php
@@ -744,7 +744,7 @@ function getTopPages($page_count_file, $locale_tag, $max_pages,
list($fr, $read, $close) = smartOpen($page_count_file);
$locale_tag = explode("-", $locale_tag)[0];
$pattern = '/^' . $locale_tag .
- "\.wikipedia\s([^\s\,\$\+\d\=\:\/\_\!\"\%]+)\s\d+\s[a-z]+\s(\d+)/u";
+ "\.wikipedia\s([^\s\,\$\+\d\=\:\/\!\"\%]+)\s\d+\s[^\d]+\s(\d+)/u";
$bytes = 0;
$bytes_since_last_output = 0;
$input_buffer = "";
@@ -781,7 +781,8 @@ function getTopPages($page_count_file, $locale_tag, $max_pages,
if (substr_count($title, "_") > 2 || strlen($title) > C\TITLE_LEN) {
continue;
}
- $title_counts[$title] = intval($page_counts);
+ $title_counts[$title] ??= 0; // so can add mobile and desktop counts
+ $title_counts[$title] += intval($page_counts);
$buffer_size = max(4 * $max_pages, 40000);
if (count($title_counts) > $buffer_size) {
echo "..pruning results to $max_pages terms.\n";
diff --git a/src/library/CrawlQueueBundle.php b/src/library/CrawlQueueBundle.php
index fa23959af..96f87acc7 100644
--- a/src/library/CrawlQueueBundle.php
+++ b/src/library/CrawlQueueBundle.php
@@ -62,11 +62,18 @@ class CrawlQueueBundle
*/
public $filter_size;
/**
- *
+ * Array of hosts for which a robots.txt file has just been received and
+ * processed, and for which urls from that host are still waiting to be
+ * notified for queueing.
+ * @var array
*/
public $robot_notify_hosts;
/**
- *
+ * LinearHashTable of information about company level domains that have
+ * been crawled. Information includes number of SEEN_URLS, number of
+ * WEIGHTED_SEEN_URLS, number of WEIGHTED_INCOMING_URLS.
+ * (A company level domain is google.com or google.co.uk, but not
+ * fo.la.google.com, www.google.com, foo.google.com or foo.google.co.uk)
* @var LinearHashTable
*/
public $domain_table;
@@ -125,9 +132,12 @@ class CrawlQueueBundle
*/
const MAX_URL_FILE_SIZE = 1000000;
/**
+ * When writing urls to robot_table, how many to buffer at a time and
+ * then bulk put.
*/
const MAX_URL_BUFFER_BEFORE_WRITE = 500;
/**
+ * File extension to use for files of serialized url data
*/
const URL_FILES_EXTENSION = ".txt.gz";
/**
diff --git a/src/library/Utility.php b/src/library/Utility.php
index 2f5512a70..3ea2568e6 100755
--- a/src/library/Utility.php
+++ b/src/library/Utility.php
@@ -2371,7 +2371,7 @@ function isPositiveInteger($input)
preg_match("/^\d+$/", trim($input)) && intval($input) > 0);
}
/**
- * Used to measure the memory foot print and time spent calling a metho
+ * Used to measure the memory footprint in bytes and time spent calling a method
* of an object. It also records number of time the method has been called.
* Just calls the method without any recording or timing until
* an initial call to the function measureCall(null, save_statistics_file)
@@ -2390,14 +2390,39 @@ function measureCall($object, $method, $arguments = [], $call_name = "")
return measureObjectCall($object, $method, $arguments, $call_name);
}
/**
- *
+ * Used to measure the memory footprint of an object in Yioop and save
+ * it to a statistics file.
+ * No recording is done until
+ * an initial call to the function measureCall(null, save_statistics_file)
+ * where save_statistics_file is the name of the file you want to store
+ * statistics to.
+ * @param object $object name of object whose size we want to measure
+ * @param string $save_file statistics file to write info to
+ * @param string $class_name name of class to use for measuring. If empty
+ * will call getClass to determine this
+ * @return bool true if legitimate arguments supplied
*/
-function measureObject($object, $save_file = "", $call_name = "")
+function measureObject($object, $save_file = "", $class_name = "")
{
- return measureObjectCall($object, $save_file, [], $call_name);
+ return measureObjectCall($object, $save_file, [], $class_name);
}
/**
+ * General method called by @see measureCall and @see measureObject.
+ * Used to measure the memory footprint in bytes of an object or memory and
+ * time spent calling a method of an object. It also records number of times
+ * the method has been called. When used to call a method before initialization,
+ * just calls the method without any recording or timing. To initialize,
+ * an initial call to the function measureCall(null, save_statistics_file)
+ * where save_statistics_file is the name of the file you want to store
+ * statistics to should be done.
*
+ * @param object $object name of object whose method we want to call and measure
+ * @param string $method method we're calling
+ * @param array $arguments arguments to be sent to the method
+ * @param string $call_name name to use when outputting stats for this call,
+ * defaults to $method.
+ * @return mixed whatever the method would normally return when called as above
+
*/
function measureObjectCall($object, $method, $arguments = [], $call_name = "")
{
diff --git a/src/library/media_jobs/TrendingHighlightsJob.php b/src/library/media_jobs/TrendingHighlightsJob.php
index adf9d42f6..bb7f53448 100644
--- a/src/library/media_jobs/TrendingHighlightsJob.php
+++ b/src/library/media_jobs/TrendingHighlightsJob.php
@@ -110,7 +110,7 @@ class TrendingHighlightsJob extends MediaJob
$this->prepareTasks();
}
/**
- *
+ * (Run on name server)
*/
public function prepareTasks()
{
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index 6014edbfa..e531fa441 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -217,7 +217,15 @@ class HtmlProcessor extends TextProcessor
return $summary;
}
/**
+ * Used to compute the favicon url for a web page.
*
+ * @param object $dom document object model of the web page trying to
+ * compute the favicon url for
+ * @param string $url of web page that $dom corresponds to. Used to
+ * help compute favicon url if the link to the icon in $dom is relative,
+ * or if no link is present and the url is guessed using the hostname.
+ * @return string url of favicon for web page (empty string if couldn't
+ * determine)
*/
public static function favicon($dom, $url)
{