Fixes a bug in UTF-8 handling in news feeds, a=chris
Fixes a bug in UTF-8 handling in news feeds, a=chris
diff --git a/src/models/SourceModel.php b/src/models/SourceModel.php
index ee7691690..5f7d7a8bc 100644
--- a/src/models/SourceModel.php
+++ b/src/models/SourceModel.php
@@ -499,7 +499,12 @@ class SourceModel extends ParallelModel
$page = preg_replace("@<@", "<", $page);
$page = preg_replace("@>@", ">", $page);
$page = preg_replace("@<!\[CDATA\[(.+?)\]\]>@", '$1', $page);
- @$dom->loadHTML($page);
+ // hack: prepend an XML encoding declaration so loadHTML parses $page as UTF-8, then remove the processing-instruction node it leaves in the DOM
+ @$dom->loadHTML('<?xml encoding="UTF-8">' . $page);
+ foreach ($dom->childNodes as $item)
+ if ($item->nodeType == XML_PI_NODE)
+ $dom->removeChild($item);
+ $dom->encoding = 'UTF-8';
}
L\crawlLog("...done. Extracting info about whole feed.");
$lang = "";