diff --git a/configs/config.php b/configs/config.php
index 3f9853ae7..9e2a2fe49 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -244,6 +244,7 @@ $INDEXED_FILE_TYPES =
"php",
"pl",
"ppt",
+ "pptx",
"png",
"rtf",
"rss",
@@ -277,6 +278,8 @@ $PAGE_PROCESSORS = array( "text/html" => "HtmlProcessor",
"application/msword" => "DocProcessor",
"application/vnd.ms-powerpoint" => "PptProcessor",
+ "application/vnd.openxmlformats-officedocument." .
+ "presentationml.presentation" => "PptxProcessor",
"application/epub+zip" => "EpubProcessor",
"text/rtf" => "RtfProcessor",
diff --git a/lib/processors/pptx_processor.php b/lib/processors/pptx_processor.php
new file mode 100644
index 000000000..ad430f6fc
--- /dev/null
+++ b/lib/processors/pptx_processor.php
@@ -0,0 +1,265 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009, 2010, 2011 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Nakul Natu nakul.natu@gmail.com
+ * @package seek_quarry
+ * @subpackage processor
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010, 2011
+ * @filesource
+ */
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Load base class, if needed.
+ */
+require_once BASE_DIR."/lib/processors/text_processor.php";
+/**
+ * Load the URL parser, used to canonicalize the links found in slides
+ */
+require_once BASE_DIR."/lib/url_parser.php";
+
+/**
+ * Used to create crawl summary information
+ * for PPTX files
+ *
+ * @author Nakul Natu
+ * @package seek_quarry
+ * @subpackage processor
+ */
+class PptxProcessor extends TextProcessor
+{
+    /**
+     * Soft cap, in bytes, on the description built by process(): text of
+     * a further slide is only appended while the description is still
+     * shorter than this, so the final description may exceed it by up to
+     * one slide's worth of text
+     * @var number
+     */
+    const MAX_DESCRIPTION_LEN = 2000;
+
+    /**
+     * Used to extract the title, description, language and links from
+     * a pptx file. A pptx file is a zip archive of xml parts; the parts
+     * read here are docProps/core.xml (title), docProps/app.xml (slide
+     * count) and ppt/slides/slide1.xml ... slideN.xml (text and links).
+     *
+     * If $page cannot be written to disk or opened as a zip archive it is
+     * handed to the parent processor instead.
+     *
+     * @param string $page pptx(zip) contents
+     * @param string $url the url where the page contents came from,
+     *      used to canonicalize relative links
+     *
+     * @return array a summary of the contents of the page
+     */
+    function process($page, $url)
+    {
+        // ZipArchive reads from a file, so the download is spooled to disk
+        $filename = CRAWL_DIR . "/pptx.zip";
+        if(file_put_contents($filename, $page) === false) {
+            /* nothing fresh was written; opening the path now could parse
+               an archive left behind by an earlier document */
+            return parent::process($page, $url);
+        }
+
+        $zip = new ZipArchive;
+        if($zip->open($filename) !== TRUE) {
+            // not a readable zip: remove the temp file, process as text
+            deleteFileOrDir($filename);
+            return parent::process($page, $url);
+        }
+
+        $summary = array();
+        $buf = $zip->getFromName("docProps/core.xml");
+        if($buf) {
+            $summary[self::TITLE] = self::title(self::dom($buf));
+        }
+
+        $slides = 0;
+        $buf = $zip->getFromName("docProps/app.xml");
+        if($buf) {
+            /* the slide count comes from the document itself, so it is
+               bounded by the number of entries the archive really has */
+            $slides = min(intval(self::slides(self::dom($buf))),
+                $zip->numFiles);
+        }
+
+        $summary[self::DESCRIPTION] = "";
+        $summary[self::LINKS] = array();
+        for($i = 1; $i <= $slides; $i++) {
+            $buf = $zip->getFromName("ppt/slides/slide" . $i . ".xml");
+            if(!$buf) {
+                continue;
+            }
+            // Get description, language and links associated with slide
+            $dom = self::dom($buf);
+            if(strlen($summary[self::DESCRIPTION])
+                < self::MAX_DESCRIPTION_LEN) {
+                $desc = self::description($dom);
+                if($desc !== "") {
+                    // keep the words of consecutive slides apart
+                    $summary[self::DESCRIPTION] = ltrim(
+                        $summary[self::DESCRIPTION] . " " . $desc);
+                }
+            }
+            $lang = self::lang($dom);
+            if($lang) {
+                $summary[self::LANG] = $lang;
+            }
+            $summary[self::LINKS] = array_merge($summary[self::LINKS],
+                self::links($dom, $url));
+        }
+        // links() limits the links of one slide; limit the document too
+        if(count($summary[self::LINKS]) > MAX_LINKS_PER_PAGE) {
+            $summary[self::LINKS] = array_slice($summary[self::LINKS], 0,
+                MAX_LINKS_PER_PAGE, true);
+        }
+
+        $zip->close();
+        // Delete the temporary zip from the crawl directory
+        deleteFileOrDir($filename);
+
+        return $summary;
+    }
+
+    /**
+     * Builds an XPath evaluator for $dom on which the p: and a: prefixes
+     * used by the queries of this class are always bound, whether or not
+     * the document declares those prefixes itself.
+     *
+     * @param object $dom a document object for a pptx xml part
+     * @return object a DOMXPath for $dom
+     */
+    private static function xpath($dom)
+    {
+        $xpath = new DOMXPath($dom);
+        $xpath->registerNamespace("p",
+            "http://schemas.openxmlformats.org/presentationml/2006/main");
+        $xpath->registerNamespace("a",
+            "http://schemas.openxmlformats.org/drawingml/2006/main");
+        return $xpath;
+    }
+
+    /**
+     * Returns links from the supplied slide dom object, canonicalized
+     * according to the supplied $site information. Only the first
+     * MAX_LINKS_PER_PAGE hyperlinked text runs of the slide are looked at.
+     *
+     * The url of a link is taken from the visible text of the hyperlinked
+     * run; the relationship target of the hyperlink is not consulted.
+     *
+     * @param object $dom a document object with links on it
+     * @param string $site a string containing a url
+     *
+     * @return array links from the $dom object as url => link text
+     */
+    static function links($dom, $site)
+    {
+        $sites = array();
+
+        $xpath = self::xpath($dom);
+        $hyperlinks = $xpath->evaluate("/p:sld//p:cSld//p:spTree//p:sp//" .
+            "p:txBody//a:p//a:r//a:rPr//a:hlinkClick");
+        if(!($hyperlinks instanceof DOMNodeList)) {
+            return $sites;
+        }
+
+        $seen = 0;
+        foreach($hyperlinks as $hyperlink) {
+            if($seen >= MAX_LINKS_PER_PAGE) {
+                break;
+            }
+            $seen++;
+
+            // a:hlinkClick -> a:rPr -> a:r, the run holding the text
+            $run = $hyperlink->parentNode->parentNode;
+            if($run === NULL) {
+                continue;
+            }
+            $text = $run->getElementsByTagName("t")->item(0);
+            if($text === NULL) {
+                continue;
+            }
+            $hlink = $text->nodeValue;
+
+            $url = UrlParser::canonicalLink($hlink, $site);
+            if(!UrlParser::checkRecursiveUrl($url) &&
+                strlen($url) < MAX_URL_LENGTH) {
+                if(isset($sites[$url])) {
+                    $sites[$url] .= " ".$hlink;
+                } else {
+                    $sites[$url] = $hlink;
+                }
+            }
+        }
+
+        return $sites;
+    }
+
+    /**
+     * Return a document object based on a string containing the contents
+     * of an xml part of a pptx file. Parse errors are suppressed; if
+     * $page is empty or not well-formed the returned document is empty.
+     *
+     * @param string $page xml document
+     *
+     * @return object document object
+     */
+    static function dom($page)
+    {
+        $dom = new DOMDocument();
+
+        // newer PHP versions throw, rather than warn, on an empty string
+        if(is_string($page) && $page !== "") {
+            @$dom->loadXML($page);
+        }
+
+        return $dom;
+    }
+
+    /**
+     * Returns the title of a pptx based on the document object of its
+     * docProps/core.xml part
+     *
+     * @param object $dom a document object to extract a title from.
+     * @return string a title of the page, NULL if none is present
+     */
+    static function title($dom)
+    {
+        $core = $dom->getElementsByTagName("coreProperties")->item(0);
+        if($core === NULL) {
+            return NULL;
+        }
+        $title = $core->getElementsByTagName("title")->item(0);
+        return ($title === NULL) ? NULL : $title->nodeValue;
+    }
+
+    /**
+     * Returns the number of slides of a pptx based on the document object
+     * of its docProps/app.xml part
+     *
+     * @param object $dom a document object to extract the count from.
+     * @return number number of slides as stored in the document, NULL if
+     *      the document does not say
+     */
+    static function slides($dom)
+    {
+        $properties = $dom->getElementsByTagName("Properties")->item(0);
+        if($properties === NULL) {
+            return NULL;
+        }
+        $slides = $properties->getElementsByTagName("Slides")->item(0);
+        return ($slides === NULL) ? NULL : $slides->nodeValue;
+    }
+
+    /**
+     * Determines the language of a slide by looking at the lang attribute
+     * of the run properties of its text runs. The first non-empty value
+     * found is used.
+     *
+     * @param object $dom a document object to check the language of
+     *
+     * @return string language tag for guessed language, NULL if no run
+     *      carries one
+     */
+    static function lang($dom)
+    {
+        $xpath = self::xpath($dom);
+
+        $properties = $xpath->evaluate("/p:sld//p:cSld//p:spTree//" .
+            "p:sp//p:txBody//a:p//a:r//a:rPr");
+        if($properties instanceof DOMNodeList) {
+            foreach($properties as $property) {
+                $lang = $property->getAttribute("lang");
+                if($lang != "") {
+                    return $lang;
+                }
+            }
+        }
+        return NULL;
+    }
+
+    /**
+     * Returns descriptive text concerning a pptx slide based on its
+     * document object. The text runs of one paragraph are joined as they
+     * are; paragraphs are separated by a single space so that the last
+     * word of one does not merge with the first word of the next.
+     *
+     * @param object $dom a document object to extract a description from.
+     * @return string a description of the slide
+     */
+    static function description($dom)
+    {
+        $xpath = self::xpath($dom);
+
+        $paragraphs = $xpath->evaluate("/p:sld//p:cSld//p:spTree" .
+            "//p:sp//p:txBody//a:p");
+        $pieces = array();
+        if($paragraphs instanceof DOMNodeList) {
+            foreach($paragraphs as $paragraph) {
+                $runs = $xpath->evaluate(".//a:r//a:t", $paragraph);
+                if(!($runs instanceof DOMNodeList)) {
+                    continue;
+                }
+                $text = "";
+                foreach($runs as $run) {
+                    $text .= $run->nodeValue;
+                }
+                if($text !== "") {
+                    $pieces[] = $text;
+                }
+            }
+        }
+        return implode(" ", $pieces);
+    }
+
+}
+?>
\ No newline at end of file
diff --git a/tests/pptx_processor_test.php b/tests/pptx_processor_test.php
new file mode 100644
index 000000000..a43943ee8
--- /dev/null
+++ b/tests/pptx_processor_test.php
@@ -0,0 +1,126 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009, 2010, 2011 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Nakul Natu nakul.natu@gmail.com
+ * @package seek_quarry
+ * @subpackage test
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010, 2011
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/** Load search engine-wide configuration file */
+require_once BASE_DIR.'/configs/config.php';
+/** Load every *_processor.php file found in the lib/processors folder */
+foreach(glob(BASE_DIR."/lib/processors/*_processor.php") as $filename) {
+ require_once $filename;
+}
+
+
+/**
+ * UnitTest for the PptxProcessor class. It is used to process
+ * pptx files which are xml based zip format
+ *
+ * @author Nakul Natu
+ * @package seek_quarry
+ * @subpackage test
+ */
+class PptxProcessorTest extends UnitTest implements CrawlConstants
+{
+    /**
+     * Runs the PptxProcessor on the test.pptx file of the test_files
+     * folder and stores the resulting summary for the test cases to check
+     */
+    public function setUp()
+    {
+        $processor = new PptxProcessor();
+        $filename = BASE_DIR . "/tests/test_files/test.pptx";
+        $page = file_get_contents($filename);
+        // the empty url means links are canonicalized against no site
+        $this->test_objects['summary'] = $processor->process($page, "");
+    }
+
+    /**
+     * Test object is set to null
+     */
+    public function tearDown()
+    {
+        $this->test_objects = null;
+    }
+
+    /**
+     * Checks title of the pptx is correct or not
+     */
+    public function checkTitleTestCase()
+    {
+        $summary = $this->test_objects['summary'];
+        $title = "Nakul Natu";
+        $this->assertEqual(
+            $summary[self::TITLE], $title, "Correct Title Retrived");
+    }
+
+    /**
+     * Checks Language of pptx is correct or not
+     */
+    public function checkLangTestCase()
+    {
+        $summary = $this->test_objects['summary'];
+        $lang = "en-US";
+        $this->assertEqual(
+            $summary[self::LANG], $lang, "Correct Language Retrived");
+    }
+
+    /**
+     * Checks the links are correct or not. The number of links is checked
+     * as well, so the test cannot pass merely because no link was
+     * extracted, and each expected link is compared by position.
+     */
+    public function checkLinksTestCase()
+    {
+        $summary = $this->test_objects['summary'];
+        $testLinks = array(
+            "http://www.google.com/",
+            "http://www.facebook.com/"
+        );
+        // the summary keys links by url; compare the link values in order
+        $links = array_values($summary[self::LINKS]);
+        $this->assertEqual(count($links), count($testLinks),
+            "Correct Number of Links Retrieved");
+        foreach($testLinks as $i => $testLink) {
+            $link = isset($links[$i]) ? $links[$i] : null;
+            $this->assertEqual(
+                $link, $testLink, "Correct Link Retrived");
+        }
+    }
+
+    /**
+     * Checks that the summary has a description entry that is not null
+     */
+    public function checkDescriptionTestCase()
+    {
+        $summary = $this->test_objects['summary'];
+        $this->assertTrue(
+            isset($summary[self::DESCRIPTION]), "Description is not null");
+    }
+}
+?>
\ No newline at end of file
diff --git a/tests/test_files/test.pptx b/tests/test_files/test.pptx
new file mode 100644
index 000000000..0673d362e
Binary files /dev/null and b/tests/test_files/test.pptx differ