Merge https://seekquarry.com/git/yioop

Vijeth [2011-08-10 14:Aug:th]

Merge https://seekquarry.com/git/yioop

Filename
configs/config.php
lib/processors/pptx_processor.php
tests/pptx_processor_test.php
tests/test_files/test.pptx

diff --git a/configs/config.php b/configs/config.php
index 3f9853ae7..9e2a2fe49 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -244,6 +244,7 @@ $INDEXED_FILE_TYPES =
             "php",
             "pl",
             "ppt",
+            "pptx",
             "png",
             "rtf",
             "rss",
@@ -277,6 +278,8 @@ $PAGE_PROCESSORS = array(   "text/html" => "HtmlProcessor",

                             "application/msword" => "DocProcessor",
                             "application/vnd.ms-powerpoint" => "PptProcessor",
+                            "application/vnd.openxmlformats-officedocument.".
+                                "presentationml.presentation"=> "PptxProcessor",
 			    "application/epub+zip" => "EpubProcessor",

                             "text/rtf" => "RtfProcessor",
diff --git a/lib/processors/pptx_processor.php b/lib/processors/pptx_processor.php
new file mode 100644
index 000000000..ad430f6fc
--- /dev/null
+++ b/lib/processors/pptx_processor.php
@@ -0,0 +1,265 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009, 2010, 2011  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Nakul Natu nakul.natu@gmail.com
+ * @package seek_quarry
+ * @subpackage processor
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010, 2011
+ * @filesource
+ */
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Load base class, if needed.
+ */
+require_once BASE_DIR."/lib/processors/text_processor.php";
+/**
+ * Load so can parse urls
+ */
+require_once BASE_DIR."/lib/url_parser.php";
+
+/**
+ * Used to create crawl summary information
+ * for PPTX files
+ *
+ * @author Nakul Natu
+ * @package seek_quarry
+ * @subpackage processor
+ */
+class PptxProcessor extends TextProcessor
+{
+    /**
+     * Constant for maximum description length
+     * @var number
+     */
+    const MAX_DESCRIPTION_LEN = 2000;
+
+    /**
+     *  Used to extract the title, description and links from
+     *  a pptx file consisting of xml data.
+     *
+     *  The raw bytes are written to a scratch file under CRAWL_DIR so that
+     *  ZipArchive can open them; the scratch file is removed again whether
+     *  or not the archive could be opened. Data that cannot be opened as a
+     *  zip archive is handed to the parent class's process() instead.
+     *
+     *  @param string $page pptx(zip) contents
+     *  @param string $url the url where the page contents came from,
+     *     used to canonicalize relative links
+     *
+     *  @return array  a summary of the contents of the page
+     */
+    function process($page, $url)
+    {
+        $summary = NULL;
+        // Stays 0 when docProps/app.xml is absent, so no slide is read
+        $slides = 0;
+        // Create a temporary pptx file
+        $filename = CRAWL_DIR . "/pptx.zip";
+        file_put_contents($filename, $page);
+        // Open a zip archive
+        $zip = new ZipArchive;
+        if ($zip->open($filename) === TRUE) {
+            $buf = $zip->getFromName("docProps/core.xml");
+            if ($buf) {
+                // Get the title
+                $summary[self::TITLE] = self::title(self::dom($buf));
+            }
+            $buf = $zip->getFromName("docProps/app.xml");
+            if ($buf) {
+                // Get number of slides present
+                $slides = intval(self::slides(self::dom($buf)));
+            }
+            $summary[self::DESCRIPTION] = "";
+            $summary[self::LINKS] = array();
+            for ($i = 1; $i <= $slides; $i++) {
+                $buf = $zip->getFromName("ppt/slides/slide" . $i . ".xml");
+                if (!$buf) {
+                    continue;
+                }
+                /* Get description, language and url links associated
+                 * with each slide */
+                $dom = self::dom($buf);
+                // Slides are appended whole, so the final text can run past
+                // MAX_DESCRIPTION_LEN by up to one slide's worth of text
+                if (strlen($summary[self::DESCRIPTION])
+                    < self::MAX_DESCRIPTION_LEN) {
+                    $summary[self::DESCRIPTION] .= self::description($dom);
+                }
+                $lang1 = self::lang($dom);
+                if ($lang1) {
+                    // A later slide's language replaces an earlier one
+                    $summary[self::LANG] = $lang1;
+                }
+                $summary[self::LINKS] = array_merge($summary[self::LINKS],
+                    self::links($dom, $url));
+            }
+            // Close the zip
+            $zip->close();
+        } else {
+            // If not pptx then process it as a text file
+            $summary = parent::process($page, $url);
+        }
+        // Delete the temporary file on both paths so it is never left behind
+        if (file_exists($filename)) {
+            deleteFileOrDir($filename);
+        }
+        return $summary;
+    }
+
+    /**
+     * Returns up to MAX_LINKS_PER_PAGE many links from the supplied
+     * dom object where links have been canonicalized according to
+     * the supplied $site information.
+     *
+     * The link is read from the text of the run holding an a:hlinkClick
+     * element, so only links whose visible text is a url are recovered.
+     *
+     * @param object $dom   a document object with links on it
+     * @param string $site   a string containing a url
+     *
+     * @return array   links from the $dom object, as url => link text
+     */
+    static function links($dom, $site)
+    {
+        $sites = array();
+        $xpath = new DOMXPath($dom);
+        $paras = $xpath->evaluate("/p:sld//p:cSld//p:spTree//p:sp//
+            p:txBody//a:p//a:r//a:rPr//a:hlinkClick");
+        $i = 0;
+        foreach ($paras as $para) {
+            if ($i >= MAX_LINKS_PER_PAGE) {
+                break;
+            }
+            $i++;
+            // parent is normally a:rPr, grandparent the a:r run with the text
+            $text = $para->parentNode->parentNode->
+                getElementsByTagName("t")->item(0);
+            if ($text === NULL) {
+                continue; // hyperlink run without any text element
+            }
+            $hlink = $text->nodeValue;
+            $url = UrlParser::canonicalLink($hlink, $site);
+            if (!UrlParser::checkRecursiveUrl($url) &&
+                strlen($url) < MAX_URL_LENGTH) {
+                $sites[$url] = (isset($sites[$url])) ?
+                    $sites[$url] . " " . $hlink : $hlink;
+            }
+        }
+        return $sites;
+    }
+    /**
+     * Parses an xml string into a DOMDocument. Parse warnings are
+     * suppressed and the document object is returned whether or not
+     * loading succeeded.
+     *
+     * @param string $page   xml document
+     *
+     * @return object  document object
+     */
+    static function dom($page)
+    {
+        $document = new DOMDocument();
+        // @ silences the warnings loadXML emits on malformed xml
+        @$document->loadXML($page);
+        return $document;
+    }
+
+    /**
+     *  Returns powerpoint head title of a pptx based on its document object
+     *
+     *  @param object $dom   document object for docProps/core.xml
+     *  @return string  title of the presentation, or NULL when the document
+     *      has no coreProperties element or no title element inside it
+     */
+    static function title($dom)
+    {
+        $coreProperties = $dom->getElementsByTagName("coreProperties");
+        if ($coreProperties->length == 0) {
+            return NULL;
+        }
+        $titles = $coreProperties->item(0)->getElementsByTagName("title");
+        return ($titles->length > 0) ? $titles->item(0)->nodeValue : NULL;
+    }
+
+    /**
+     *  Returns number of slides of pptx based on its document object
+     *
+     *  @param object $dom   document object for docProps/app.xml
+     *  @return number  number of slides, or 0 when the document has no
+     *      Properties element or no Slides element inside it
+     */
+    static function slides($dom)
+    {
+        $properties = $dom->getElementsByTagName("Properties");
+        if ($properties->length == 0) {
+            return 0;
+        }
+        $slides = $properties->item(0)->getElementsByTagName("Slides");
+        return ($slides->length > 0) ? intval($slides->item(0)->nodeValue) : 0;
+    }
+
+    /**
+     *  Determines the language of a slide by looking at the lang
+     *  attribute of its a:rPr tags.
+     *
+     *  @param object $dom  a document object to check the language of
+     *  @return string first non-empty lang attribute found, else NULL
+     */
+    static function lang($dom)
+    {
+        $xpath = new DOMXPath($dom);
+        $languages = $xpath->evaluate("/p:sld//p:cSld//p:spTree//
+            p:sp//p:txBody//a:p//a:r//a:rPr");
+        foreach ($languages as $language) {
+            $lang = $language->getAttribute("lang");
+            if ($lang !== "") { return $lang; } // skip runs lacking lang
+        }
+        return NULL;
+    }
+
+    /**
+     * Collects the text of a single pptx slide from its document object
+     * by concatenating, in document order and without separators, the
+     * value of every a:t node matched under the slide's shapes.
+     *
+     * @param object $dom   a document object to extract a description from.
+     * @return string a description of the slide
+     */
+    static function description($dom)
+    {
+        $xpath = new DOMXPath($dom);
+        $text_nodes = $xpath->evaluate("/p:sld//p:cSld//p:spTree
+            //p:sp//p:txBody//a:p//a:r//a:t");
+        $pieces = array();
+        foreach ($text_nodes as $text_node) {
+            $pieces[] = $text_node->nodeValue;
+        }
+        return implode("", $pieces);
+    }
+
+}
+?>
\ No newline at end of file
diff --git a/tests/pptx_processor_test.php b/tests/pptx_processor_test.php
new file mode 100644
index 000000000..a43943ee8
--- /dev/null
+++ b/tests/pptx_processor_test.php
@@ -0,0 +1,126 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009, 2010, 2011  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Nakul Natu nakul.natu@gmail.com
+ * @package seek_quarry
+ * @subpackage test
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009, 2010, 2011
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/** Load search engine-wide configuration file */
+require_once BASE_DIR.'/configs/config.php';
+foreach(glob(BASE_DIR."/lib/processors/*_processor.php") as $filename) {
+    require_once $filename;
+}
+
+
+/**
+ * UnitTest for the PptxProcessor class. It is used to process
+ * pptx files which are xml based zip format
+ *
+ * @author Nakul Natu
+ * @package seek_quarry
+ * @subpackage test
+ */
+class PptxProcessorTest extends UnitTest implements CrawlConstants
+{
+    /**
+     *  Runs PptxProcessor on tests/test_files/test.pptx with an empty url
+     *  and stores the resulting summary for the test cases to inspect
+     */
+    public function setUp()
+    {
+        $class_name = "PptxProcessor";
+        $pptx_processor = new $class_name();
+        $pptx_path = BASE_DIR . "/tests/test_files/test.pptx";
+        $contents = file_get_contents($pptx_path);
+        // no base url is supplied for the fixture, hence the empty string
+        $this->test_objects['summary'] =
+            $pptx_processor->process($contents, "");
+    }
+
+    /**
+     * Discards the objects stored by setUp by resetting them to null
+     */
+    public function tearDown()
+    {
+        $this->test_objects = NULL;
+    }
+    /**
+     * Checks that the title extracted from the pptx is "Nakul Natu"
+     */
+    public function checkTitleTestCase()
+    {
+        $summary = $this->test_objects['summary'];
+        $extracted_title = $summary[self::TITLE];
+        $this->assertEqual($extracted_title, "Nakul Natu",
+            "Correct Title Retrived");
+    }
+
+    /**
+     * Checks that the language extracted from the pptx is "en-US"
+     */
+    public function checkLangTestCase()
+    {
+        $summary = $this->test_objects['summary'];
+        $extracted_lang = $summary[self::LANG];
+        $this->assertEqual($extracted_lang, "en-US",
+            "Correct Language Retrived");
+    }
+
+    /**
+     * Checks the links are correct or not. The link count is asserted too,
+     * so the test cannot pass by extracting no links at all.
+     */
+    public function checkLinksTestCase()
+    {
+        $objects = $this->test_objects['summary'];
+        $testLinks = array("http://www.google.com/",
+            "http://www.facebook.com/");
+        // only the values are compared, in order; keys are dropped here
+        $links = array_values($objects[self::LINKS]);
+        $this->assertEqual(
+            count($links), count($testLinks), "Correct Number of Links");
+        foreach ($links as $i => $link) {
+            $expected = isset($testLinks[$i]) ? $testLinks[$i] : NULL;
+            $this->assertEqual($link, $expected, "Correct Link Retrived");
+        }
+    }
+
+    /**
+     * Checks that the summary has a description entry that is not null
+     */
+    public function checkDescriptionTestCase()
+    {
+        $objects = $this->test_objects['summary'];
+        $this->assertTrue(
+            isset($objects[self::DESCRIPTION]),"Description is not null");
+    }
+}
+?>
\ No newline at end of file
diff --git a/tests/test_files/test.pptx b/tests/test_files/test.pptx
new file mode 100644
index 000000000..0673d362e
Binary files /dev/null and b/tests/test_files/test.pptx differ

ViewGit