Try to improve link text extracted by HtmlProcessor, a=chris

Chris Pollett [2022-07-28, 22h]

Try to improve link text extracted by HtmlProcessor, a=chris

Filename
src/library/processors/HtmlProcessor.php

diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index e531fa441..55b25175f 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -264,6 +264,7 @@ class HtmlProcessor extends TextProcessor
      *
      * @param string $url of website that is currently being processed
      * @param array $links associative array of $link_url => $link_text pairs
+     * @return array of important links for the url
      */
     public static function computeTopLevelLinks($url, $links)
     {
@@ -573,8 +574,6 @@ class HtmlProcessor extends TextProcessor
         $sites = [];
         $xpath = new \DOMXPath($dom);
         $tokenizer = PhraseParser::getTokenizer($lang);
-        $has_stopwords_remover = !empty($tokenizer) &&
-            method_exists($tokenizer, "stopwordsRemover");
         $base_refs = $xpath->evaluate("/html//base");
         if ($base_refs->item(0)) {
             $tmp_site = $base_refs->item(0)->getAttribute('href');
@@ -594,14 +593,10 @@ class HtmlProcessor extends TextProcessor
                     $len = strlen($url);
                     if (!UrlParser::checkRecursiveUrl($url)  &&
                         $len < C\MAX_URL_LEN && $len > 4) {
-                        $text = $href->nodeValue ;
+                        $text = $href->nodeValue;
                         $url_title = $href->getAttribute('title') ?? "";
-                        if ($has_stopwords_remover) {
-                            $useful_text = $tokenizer->stopwordsRemover($text);
-                        } else {
-                            $useful_text = $text;
-                        }
-                        $useful_text = preg_replace("/\.\.|\s/u", "",
+                        $useful_text = $text;
+                        $useful_text = preg_replace("/\.\.|\s+/u", "",
                             $useful_text);
                         if (mb_strlen($useful_text) < C\MIN_LINKS_TEXT_CHARS) {
                             $parent_node = $href->parentNode;

ViewGit