Add better error checking for malformed pptx files, a=chris

Chris Pollett [2016-09-29, 23h]
Add better error checking for malformed pptx files, a=chris
Filename
src/library/processors/PptxProcessor.php
diff --git a/src/library/processors/PptxProcessor.php b/src/library/processors/PptxProcessor.php
index 20d1998b2..7cb3091a1 100644
--- a/src/library/processors/PptxProcessor.php
+++ b/src/library/processors/PptxProcessor.php
@@ -145,6 +145,10 @@ class PptxProcessor extends TextProcessor
         $i=0;
         foreach ($paras as $para) {
             if ($i < C\MAX_LINKS_TO_EXTRACT) {
+                if (empty($para->parentNode->parentNode->
+                    getElementsByTagName("t")->item(0)->nodeValue)) {
+                    continue;
+                }
                 $hlink = $para->parentNode->parentNode->
                     getElementsByTagName("t")->item(0)->nodeValue;
                 $url = UrlParser::canonicalLink(
@@ -167,14 +171,16 @@ class PptxProcessor extends TextProcessor
      * Return a document object based on a string containing the contents of
      * a web page
      *
-     * @param string $page   xml document
+     * @param string $page xml document
      *
      * @return object  document object
      */
     public static function dom($page)
     {
         $dom = new \DOMDocument();
+        restore_error_handler();
         @$dom->loadXML($page);
+        set_error_handler(C\NS_LIB . "yioop_error_handler");
         return $dom;
     }
     /**
@@ -186,10 +192,15 @@ class PptxProcessor extends TextProcessor
      */
     public static function title($dom)
     {
-        $coreProperties = $dom->getElementsByTagName("coreProperties");
-        $property = $coreProperties->item(0);
-        $titles = $property->getElementsByTagName("title");
-        $title = $titles->item(0)->nodeValue;
+        $core_properties = $dom->getElementsByTagName("coreProperties");
+        $title = "";
+        if (!empty($core_properties) && !empty($core_properties->item(0))) {
+            $property = $core_properties->item(0);
+            $titles = $property->getElementsByTagName("title");
+            if (!empty($titles) && !empty($titles->item(0))) {
+                $title = $titles->item(0)->nodeValue;
+            }
+        }
         return $title;
     }
     /**
@@ -202,9 +213,14 @@ class PptxProcessor extends TextProcessor
     public static function numSlides($dom)
     {
         $properties = $dom->getElementsByTagName("Properties");
-        $property = $properties->item(0);
-        $slides = $property->getElementsByTagName("Slides");
-        $number = $slides->item(0)->nodeValue;
+        $number = 0;
+        if (!empty($properties) && !empty($properties->item(0))) {
+            $property = $properties->item(0);
+            $slides = $property->getElementsByTagName("Slides");
+            if (!empty($slides->item(0))) {
+                $number = $slides->item(0)->nodeValue;
+            }
+        }
         return $number;
     }
     /**
@@ -220,7 +236,7 @@ class PptxProcessor extends TextProcessor
         $xpath = new \DOMXPath($dom);
         $languages = $xpath->evaluate("/p:sld//p:cSld//p:spTree//
             p:sp//p:txBody//a:p//a:r//a:rPr");
-        if (!$languages) {
+        if (empty($languages) || empty($languages->item(0))) {
             return false;
         }
         return $languages->item(0)->getAttribute("lang");
@@ -239,6 +255,9 @@ class PptxProcessor extends TextProcessor
         $description = "";
         $len = 0;
         foreach ($paragraphs as $paragraph) {
+            if (empty($paragraph)) {
+                continue;
+            }
             $text = $paragraph->nodeValue."\n\n";
             $text_len = strlen($text);
             $len += $text_len;
ViewGit