diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 49a9847f0..576564086 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -2931,8 +2931,7 @@ class Fetcher implements CrawlConstants
}
}
if (empty($site[self::LANG])) {
- $lang = L\guessLocaleFromString(
- $site[self::DESCRIPTION], C\DEFAULT_LOCALE);
+ $lang = L\guessLocaleFromString($site[self::DESCRIPTION]);
} else {
$lang = $site[self::LANG];
}
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 8a64fde7a..e00dbf893 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -214,6 +214,7 @@ class UrlParser
"tr" => 'tr',
"tw" => 'zh-CN',
"vi" => 'vi-VN',
+ "vn" => 'vi-VN',
"cn" => 'zh-CN',
];
$host = self::getHost($url, false);
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index 549df7bf3..9239876af 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -109,8 +109,9 @@ class HtmlProcessor extends TextProcessor
$page = preg_replace('/\ \;|\&rdquo\;|\&ldquo\;|\&mdash\;/si',
' ', $page);
$page =
- preg_replace('@<script[^>]*?>.*?</script>@si', ' ', $page);
- $dom_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ',
+ preg_replace('@<script[^>]*?>[\s\S]*?</script\s*>@si', ' ',
+ $page);
+ $dom_page = preg_replace('@<style[^>]*?>[\s\S]*?</style>@si', ' ',
$page);
$dom = self::dom($dom_page);
if ($dom !== false ) {
@@ -119,8 +120,8 @@ class HtmlProcessor extends TextProcessor
if ($summary[self::TITLE] == "") {
$summary[self::TITLE] = self::crudeTitle($dom_page);
}
- $summary[self::LANG] = self::lang($dom,
- strip_tags($page), $url);
+ $summary[self::LANG] = self::lang($dom, strip_tags($dom_page),
+ $url);
$description_dom = $dom;
if (!empty($scraper)) {
$scrape_results = ScraperManager::applyScraperRules(
@@ -239,48 +240,49 @@ class HtmlProcessor extends TextProcessor
'(\-[a-zA-Z][a-zA-Z])?)[\'|\"]?/', $item->nodeValue,
$match)) {
if (!empty($match[1])) {
- return $match[1];
+ $lang = $match[1];
+ if ($lang != 'en' && $lang != 'en-US') {
+ return $lang;
+ }
}
}
}
}
$htmls = $dom->getElementsByTagName("html");
- $lang = null;
+ $lang = (empty($lang)) ? null : $lang;
foreach ($htmls as $html) {
$lang = $html->getAttribute('lang');
- if ($lang != null) {
+ if ($lang != null && $lang != 'en' && $lang != 'en-US') {
return $lang;
}
}
- if ($lang == null) {
- //baidu doesn't have a lang attribute but does say encoding
- $xpath = new \DOMXPath($dom);
- $charset_checks = ["contains(translate(@http-equiv,".
- "'abcdefghijklmnopqrstuvwxyz'," .
- " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-LANGUAGE')" => 0,
- "contains(translate(@http-equiv,".
- "'abcdefghijklmnopqrstuvwxyz'," .
- " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-TYPE')" => 1];
- foreach ($charset_checks as $charset_check => $index) {
- $metas = $xpath->evaluate("/html/head//meta[$charset_check]");
- $found_metas = [];
- foreach ($metas as $meta) {
- $content = $meta->getAttribute('content');
- $charset_metas = explode("=", $content);
- if ($index == 0) {
- return $charset_metas[$index];
- }
- if (isset($charset_metas[$index])) {
- $charset = strtoupper($charset_metas[$index]);
- $lang = L\guessLangEncoding($charset);
- if ($lang != 'en') { //default is en, so keep checking
- return $lang;
- }
+ //Baidu has no lang attribute but does declare an encoding in its metas
+ $xpath = new \DOMXPath($dom);
+ $charset_checks = ["contains(translate(@http-equiv,".
+ "'abcdefghijklmnopqrstuvwxyz'," .
+ " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-LANGUAGE')" => 0,
+ "contains(translate(@http-equiv,".
+ "'abcdefghijklmnopqrstuvwxyz'," .
+ " 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),'CONTENT-TYPE')" => 1];
+ foreach ($charset_checks as $charset_check => $index) {
+ $metas = $xpath->evaluate("/html/head//meta[$charset_check]");
+ $found_metas = [];
+ foreach ($metas as $meta) {
+ $content = $meta->getAttribute('content');
+ $charset_metas = explode("=", $content);
+ if ($index == 0) {
+ return $charset_metas[$index];
+ }
+ if (isset($charset_metas[$index])) {
+ $charset = strtoupper($charset_metas[$index]);
+ $lang = L\guessLangEncoding($charset);
+ if ($lang != 'en') { //default is en, so keep checking
+ return $lang;
}
}
}
- $lang = self::calculateLang($sample_text, $url);
}
+ $lang = self::calculateLang($sample_text, $url);
return $lang;
}
/**