Further attempt at more robust language detection, a=chris

Chris Pollett [2019-06-10 23h]
Further attempt at more robust language detection, a=chris
Filename
src/executables/Fetcher.php
src/library/UrlParser.php
src/library/processors/HtmlProcessor.php
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 49a9847f0..576564086 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -2931,8 +2931,7 @@ class Fetcher implements CrawlConstants
                     }
                 }
                 if (empty($site[self::LANG])) {
-                    $lang = L\guessLocaleFromString(
-                        $site[self::DESCRIPTION], C\DEFAULT_LOCALE);
+                    $lang = L\guessLocaleFromString($site[self::DESCRIPTION]);
                 } else {
                     $lang = $site[self::LANG];
                 }
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 8a64fde7a..e00dbf893 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -214,6 +214,7 @@ class UrlParser
             "tr" => 'tr',
             "tw" => 'zh-CN',
             "vi" => 'vi-VN',
+            "vn" => 'vi-VN',
             "cn" => 'zh-CN',
         ];
         $host = self::getHost($url, false);
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index 549df7bf3..9239876af 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -109,8 +109,9 @@ class HtmlProcessor extends TextProcessor
             $page = preg_replace('/\&nbsp\;|\&rdquo\;|\&ldquo\;|\&mdash\;/si',
                 ' ', $page);
             $page =
-                preg_replace('@<script[^>]*?>.*?</script>@si', ' ', $page);
-            $dom_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ',
+                preg_replace('@<script[^>]*?>[\s\S]*?</script\s*>@si', ' ',
+                $page);
+            $dom_page = preg_replace('@<style[^>]*?>[\s\S]*?</style>@si', ' ',
                 $page);
             $dom = self::dom($dom_page);
             if ($dom !== false ) {
@@ -119,8 +120,8 @@ class HtmlProcessor extends TextProcessor
                 if ($summary[self::TITLE] == "") {
                     $summary[self::TITLE] = self::crudeTitle($dom_page);
                 }
-                $summary[self::LANG] = self::lang($dom,
-                    strip_tags($page), $url);
+                $summary[self::LANG] = self::lang($dom, strip_tags($dom_page),
+                    $url);
                 $description_dom = $dom;
                 if (!empty($scraper)) {
                     $scrape_results = ScraperManager::applyScraperRules(
@@ -239,48 +240,49 @@ class HtmlProcessor extends TextProcessor
                     '(\-[a-zA-Z][a-zA-Z])?)[\'|\"]?/', $item->nodeValue,
                     $match)) {
                     if (!empty($match[1])) {
-                        return $match[1];
+                        $lang = $match[1];
+                        if ($lang != 'en' && $lang != 'en-US') {
+                            return $lang;
+                        }
                     }
                 }
             }
         }
         $htmls = $dom->getElementsByTagName("html");
-        $lang = null;
+        $lang = (empty($lang)) ? null : $lang;
         foreach ($htmls as $html) {
             $lang = $html->getAttribute('lang');
-            if ($lang != null) {
+            if ($lang != null && $lang != 'en' && $lang != 'en-US') {
                 return $lang;
             }
         }
-        if ($lang == null) {
-            //baidu doesn't have a lang attribute but does say encoding
-            $xpath = new \DOMXPath($dom);
-            $charset_checks = ["contains(translate(@http-equiv,".
-                "'abcdefghijklmnopqrstuvwxyz'," .
-                " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-LANGUAGE')" => 0,
-                "contains(translate(@http-equiv,".
-                "'abcdefghijklmnopqrstuvwxyz'," .
-                " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-TYPE')" => 1];
-            foreach ($charset_checks as $charset_check => $index) {
-                $metas = $xpath->evaluate("/html/head//meta[$charset_check]");
-                $found_metas = [];
-                foreach ($metas as $meta) {
-                    $content = $meta->getAttribute('content');
-                    $charset_metas = explode("=", $content);
-                    if ($index == 0) {
-                        return $charset_metas[$index];
-                    }
-                    if (isset($charset_metas[$index])) {
-                        $charset = strtoupper($charset_metas[$index]);
-                        $lang = L\guessLangEncoding($charset);
-                        if ($lang != 'en') { //default is en, so keep checking
-                            return $lang;
-                        }
+        //baidu doesn't have a lang attribute but does say encoding
+        $xpath = new \DOMXPath($dom);
+        $charset_checks = ["contains(translate(@http-equiv,".
+            "'abcdefghijklmnopqrstuvwxyz'," .
+            " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-LANGUAGE')" => 0,
+            "contains(translate(@http-equiv,".
+            "'abcdefghijklmnopqrstuvwxyz'," .
+            " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-TYPE')" => 1];
+        foreach ($charset_checks as $charset_check => $index) {
+            $metas = $xpath->evaluate("/html/head//meta[$charset_check]");
+            $found_metas = [];
+            foreach ($metas as $meta) {
+                $content = $meta->getAttribute('content');
+                $charset_metas = explode("=", $content);
+                if ($index == 0) {
+                    return $charset_metas[$index];
+                }
+                if (isset($charset_metas[$index])) {
+                    $charset = strtoupper($charset_metas[$index]);
+                    $lang = L\guessLangEncoding($charset);
+                    if ($lang != 'en') { //default is en, so keep checking
+                        return $lang;
                     }
                 }
             }
-            $lang = self::calculateLang($sample_text, $url);
         }
+        $lang = self::calculateLang($sample_text, $url);
         return $lang;
     }
     /**
ViewGit