Try to improve link text extracted by HtmlProcessor, a=chris
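Link text is no longer passed through the tokenizer's stopwordsRemover before the minimum-length check, the whitespace-stripping regex now matches whole runs (\s+), and computeTopLevelLinks gains an @return doc line.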
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index e531fa441..55b25175f 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -264,6 +264,7 @@ class HtmlProcessor extends TextProcessor
*
* @param string $url of website that is currently being processed
* @param array $links associative array of $link_url => $link_text pairs
+ * @return array of important links for the url
*/
public static function computeTopLevelLinks($url, $links)
{
@@ -573,8 +574,6 @@ class HtmlProcessor extends TextProcessor
$sites = [];
$xpath = new \DOMXPath($dom);
$tokenizer = PhraseParser::getTokenizer($lang);
- $has_stopwords_remover = !empty($tokenizer) &&
- method_exists($tokenizer, "stopwordsRemover");
$base_refs = $xpath->evaluate("/html//base");
if ($base_refs->item(0)) {
$tmp_site = $base_refs->item(0)->getAttribute('href');
@@ -594,14 +593,10 @@ class HtmlProcessor extends TextProcessor
$len = strlen($url);
if (!UrlParser::checkRecursiveUrl($url) &&
$len < C\MAX_URL_LEN && $len > 4) {
- $text = $href->nodeValue ;
+ $text = $href->nodeValue;
$url_title = $href->getAttribute('title') ?? "";
- if ($has_stopwords_remover) {
- $useful_text = $tokenizer->stopwordsRemover($text);
- } else {
- $useful_text = $text;
- }
- $useful_text = preg_replace("/\.\.|\s/u", "",
+ $useful_text = $text;
+ $useful_text = preg_replace("/\.\.|\s+/u", "",
$useful_text);
if (mb_strlen($useful_text) < C\MIN_LINKS_TEXT_CHARS) {
$parent_node = $href->parentNode;