Fixes a bug in UTF-8 handling in news feeds, a=chris

Chris Pollett [2015-08-10, 16h]
Fixes a bug in UTF-8 handling in news feeds, a=chris
Filename
src/models/SourceModel.php
diff --git a/src/models/SourceModel.php b/src/models/SourceModel.php
index ee7691690..5f7d7a8bc 100644
--- a/src/models/SourceModel.php
+++ b/src/models/SourceModel.php
@@ -499,7 +499,12 @@ class SourceModel extends ParallelModel
                 $page = preg_replace("@&lt;@", "<", $page);
                 $page = preg_replace("@&gt;@", ">", $page);
                 $page = preg_replace("@<!\[CDATA\[(.+?)\]\]>@", '$1', $page);
-                @$dom->loadHTML($page);
+                // we also need a hack to make UTF-8 work correctly
+                @$dom->loadHTML('<?xml encoding="UTF-8">' . $page);
+                foreach ($dom->childNodes as $item)
+                if ($item->nodeType == XML_PI_NODE)
+                    $dom->removeChild($item);
+                $dom->encoding = 'UTF-8';
             }
             L\crawlLog("...done. Extracting info about whole feed.");
             $lang = "";
ViewGit