Add better error checking for malformed pptx files, a=chris
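
PptxProcessor no longer assumes the expected XML nodes exist:
- links(): skip a hyperlink whose first "t" element is missing or empty
- dom(): call restore_error_handler() before loadXML() and reinstall
  yioop_error_handler afterwards
- title(): return "" when coreProperties or its title element is missing
- numSlides(): return 0 when Properties or its Slides element is missing
- language lookup: return false when the XPath query matches no a:rPr node
- description loop: skip empty paragraph entries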
diff --git a/src/library/processors/PptxProcessor.php b/src/library/processors/PptxProcessor.php
index 20d1998b2..7cb3091a1 100644
--- a/src/library/processors/PptxProcessor.php
+++ b/src/library/processors/PptxProcessor.php
@@ -145,6 +145,10 @@ class PptxProcessor extends TextProcessor
$i=0;
foreach ($paras as $para) {
if ($i < C\MAX_LINKS_TO_EXTRACT) {
+ if (empty($para->parentNode->parentNode->
+ getElementsByTagName("t")->item(0)->nodeValue)) {
+ continue;
+ }
$hlink = $para->parentNode->parentNode->
getElementsByTagName("t")->item(0)->nodeValue;
$url = UrlParser::canonicalLink(
@@ -167,14 +171,16 @@ class PptxProcessor extends TextProcessor
* Return a document object based on a string containing the contents of
* a web page
*
- * @param string $page xml document
+ * @param string $page xml document
*
* @return object document object
*/
public static function dom($page)
{
$dom = new \DOMDocument();
+ restore_error_handler();
@$dom->loadXML($page);
+ set_error_handler(C\NS_LIB . "yioop_error_handler");
return $dom;
}
/**
@@ -186,10 +192,15 @@ class PptxProcessor extends TextProcessor
*/
public static function title($dom)
{
- $coreProperties = $dom->getElementsByTagName("coreProperties");
- $property = $coreProperties->item(0);
- $titles = $property->getElementsByTagName("title");
- $title = $titles->item(0)->nodeValue;
+ $core_properties = $dom->getElementsByTagName("coreProperties");
+ $title = "";
+ if (!empty($core_properties) && !empty($core_properties->item(0))) {
+ $property = $core_properties->item(0);
+ $titles = $property->getElementsByTagName("title");
+ if (!empty($titles) && !empty($titles->item(0))) {
+ $title = $titles->item(0)->nodeValue;
+ }
+ }
return $title;
}
/**
@@ -202,9 +213,14 @@ class PptxProcessor extends TextProcessor
public static function numSlides($dom)
{
$properties = $dom->getElementsByTagName("Properties");
- $property = $properties->item(0);
- $slides = $property->getElementsByTagName("Slides");
- $number = $slides->item(0)->nodeValue;
+ $number = 0;
+ if (!empty($properties) && !empty($properties->item(0))) {
+ $property = $properties->item(0);
+ $slides = $property->getElementsByTagName("Slides");
+ if (!empty($slides->item(0))) {
+ $number = $slides->item(0)->nodeValue;
+ }
+ }
return $number;
}
/**
@@ -220,7 +236,7 @@ class PptxProcessor extends TextProcessor
$xpath = new \DOMXPath($dom);
$languages = $xpath->evaluate("/p:sld//p:cSld//p:spTree//
p:sp//p:txBody//a:p//a:r//a:rPr");
- if (!$languages) {
+ if (empty($languages) || empty($languages->item(0))) {
return false;
}
return $languages->item(0)->getAttribute("lang");
@@ -239,6 +255,9 @@ class PptxProcessor extends TextProcessor
$description = "";
$len = 0;
foreach ($paragraphs as $paragraph) {
+ if (empty($paragraph)) {
+ continue;
+ }
$text = $paragraph->nodeValue."\n\n";
$text_len = strlen($text);
$len += $text_len;