Backs out epub code as not stable, a=chris

Chris Pollett [2011-08-28 04h]

Backs out epub code as not stable, a=chris

Filename
configs/config.php
lib/processors/epub_processor.php
lib/processors/pptx_processor.php
lib/processors/xlsx_processor.php
tests/epub_processor_test.php
tests/test_files/aabergj29666296668epub.epub

diff --git a/configs/config.php b/configs/config.php
index e3bfc6319..3d9d2620d 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -235,7 +235,6 @@ $INDEXED_FILE_TYPES =
             "cfml",
             "csv",
             "doc",
-	    "epub",
             "gif",
             "html",
             "htm",
diff --git a/lib/processors/epub_processor.php b/lib/processors/epub_processor.php
deleted file mode 100644
index ce7986796..000000000
--- a/lib/processors/epub_processor.php
+++ /dev/null
@@ -1,230 +0,0 @@
-<?php
-/**
- *  SeekQuarry/Yioop --
- *  Open Source Pure PHP Search Engine, Crawler, and Indexer
- *
- *  Copyright (C) 2009, 2010, 2011  Chris Pollett chris@pollett.org
- *
- *  LICENSE:
- *
- *  This program is free software: you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation, either version 3 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- *  END LICENSE
- *
- * @author Vijeth Patil vijeth.patil@gmail.com
- * @package seek_quarry
- * @subpackage processor
- * @license http://www.gnu.org/licenses/ GPL3
- * @link http://www.seekquarry.com/
- * @copyright 2009, 2010, 2011
- * @filesource
- */
-
-if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
-
-/**
- * Load base class, if needed.
- */
-require_once BASE_DIR."/lib/processors/text_processor.php";
-
-/**
- * If XML turns out to be XHTML ...
- */
-require_once BASE_DIR."/lib/processors/html_processor.php";
-
-/**
- * Load so can parse urls
- */
-require_once BASE_DIR."/lib/url_parser.php";
-
- /**
- * Used to create crawl summary information
- * for XML files (those served as application/epub+zip)
- *
- * @author Vijeth Patil
- * @package seek_quarry
- * @subpackage processor
- */
-
-class EpubProcessor extends TextProcessor
-{
-    /**
-     *  The name of the tag element in an xml document
-     *
-     *  @var string name
-     */
-    var $name;
-
-    /**
-     *  The attribute of the tag element in an xml document
-     *
-     *  @var string attributes
-     */
-    var $attributes;
-
-    /**
-     *  The content of the tag element or attribute, used to extract
-     *  the fields like title, creator, language of the document
-     *
-     *  @var string content
-     */
-    var $content;
-
-    /**
-     *  The child tag element of a tag element.
-     *
-     *  @var string children
-     */
-    var $children;
-
-    /**
-     *  The maximum length of description
-     *
-     *  @const integer MAX_DESCRIPTION_LEN
-     */
-    const MAX_DESCRIPTION_LEN = 2000;
-    /**
-     * The processor will get the first this many files found in
-     * an .odf file and get the first this many elements from
-     * each of those files
-     *
-     *  @const integer MAX_DOM_LEVEL
-     */
-    const MAX_DOM_LEVEL = 10;
-    /**
-     *  Used to extract the title, description and links from
-     *  a string consisting of ebook publication data.
-     *
-     *  @param string $page epub contents
-     *  @param string $url the url where the page contents came from,
-     *     used to canonicalize relative links
-     *
-     *  @return array  a summary of the contents of the page
-     *
-     */
-    function process($page, $url)
-    {
-        $summary = NULL;
-        $opf_pattern = "/.opf$/i";
-        $html_pattern  = "/.html$/i";
-        $xhtml_pattern = "/.xhtml$/i";
-        $temp_filename = "epubzipfilename.zip";
-        $epub_url = 0;
-        $epub_language = '';
-        $epub_title = '';
-        $epub_unique_identifier = '';
-        $epub_author = '';
-
-        file_put_contents($temp_filename, $page);
-        $zip = new ZipArchive;
-        if($zip->open($temp_filename)) {
-            for($i = 0; $i < $zip->numFiles; $i++) {
-                // get the content file names of .epub document
-                $filename[$i] = $zip->getNameIndex($i) ;
-                if(preg_match($opf_pattern, $filename[$i])) {
-                    // Get the file data from zipped folder
-                    $opf_data = $zip->getFromName($filename[$i]);
-                    $opf_summary = $this->xmlToObject($opf_data);
-                    for($m = 0; $m <= self::MAX_DOM_LEVEL; $m++) {
-                        for($n = 0;$n <= self::MAX_DOM_LEVEL; $n++) {
-                            if(isset($opf_summary->children[$m]->children[$n])){
-                                $child = $opf_summary->children[$m]->
-                                    children[$n];
-                                if( isset($child->name) &&
-                                    $child->name == "dc:language") {
-                                    $epub_language =
-                                        $opf_summary->children[$m]->
-                                            children[$n]->content ;
-                                }
-                                if( ($opf_summary->children[$m]->children[$n]->
-                                    name) == "dc:title") {
-                                    $epub_title = $opf_summary->children[$m]->
-                                        children[$n]->content;
-                                }
-                                if( ($opf_summary->children[$m]->children[$n]->
-                                    name) == "dc:creator") {
-                                    $epub_author = $opf_summary->children[$m]->
-                                        children[$n]->content ;
-                                }
-                                if( ($opf_summary->children[$m]->children[$n]->
-                                    name) == "dc:identifier") {
-                                    $epub_unique_identifier = $opf_summary->
-                                        children[$m]->children[$n]->content ;
-                                }
-                            }
-                        }
-                    }
-                }else if((preg_match($html_pattern,$filename[$i])) ||
-                    (preg_match($xhtml_pattern,$filename[$i]))) {
-                    $html = new HtmlProcessor;
-                    $html_data = $zip->getFromName($filename[$i]);
-                    $description[$i] = $html->process($html_data,$url);
-                }
-            }
-        }
-        $summary[self::TITLE] = $epub_title;
-        $summary[self::DESCRIPTION] = $description;
-        $summary[self::LANG] = $epub_language;
-        $summary[self::LINKS] = $epub_url;
-        $summary[self::PAGE] = $page;
-        unlink($temp_filename);
-        return $summary;
-    }
-
-    /**
-     *  Used to extract the DOM tree containing the information
-     *  about the epub file such as title, author, language, unique
-     *  identifier of the book from a string consisting of ebook publication
-     *  content OPF file.
-     *
-     *  @param string $page xml contents
-     *
-     *  @return array  an information about the contents of the page
-     *
-     */
-    function xmlToObject($xml)
-    {
-        $parser = xml_parser_create();
-        xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0);
-        xml_parser_set_option($parser, XML_OPTION_SKIP_WHITE, 1);
-        xml_parse_into_struct($parser, $xml, $tags);
-        xml_parser_free($parser);
-
-        $elements = array();  // the currently filling [child] XmlElement array
-        $stack = array();
-        foreach ($tags as $tag) {
-            $index = count($elements);
-            if ($tag['type'] == "complete" || $tag['type'] == "open") {
-                $elements[$index] = new EpubProcessor;
-                $elements[$index]->name = $tag['tag'];
-                if(isset($tag['attributes'])) {
-                    $elements[$index]->attributes = $tag['attributes'];
-                }
-                if(isset($tag['value'])) {
-                    $elements[$index]->content = $tag['value'];
-                }
-                if ($tag['type'] == "open") {  // push
-                    $elements[$index]->children = array();
-                    $stack[] = &$elements;
-                    $elements = &$elements[$index]->children;
-                }
-            }
-            if ($tag['type'] == "close") {  // pop
-                $elements = array_pop($stack);
-            }
-        }
-        return $elements[0];  // the single top-level element
-    }
-}
-?>
diff --git a/lib/processors/pptx_processor.php b/lib/processors/pptx_processor.php
index 15a4b163a..a71186179 100644
--- a/lib/processors/pptx_processor.php
+++ b/lib/processors/pptx_processor.php
@@ -131,7 +131,6 @@ class PptxProcessor extends TextProcessor
         // If not pptx then process it as a text file
             $summary = parent::process($page, $url);
         }
-
         return $summary;
     }

diff --git a/lib/processors/xlsx_processor.php b/lib/processors/xlsx_processor.php
index b156834e4..24c63fdf0 100644
--- a/lib/processors/xlsx_processor.php
+++ b/lib/processors/xlsx_processor.php
@@ -132,7 +132,6 @@ class XlsxProcessor extends TextProcessor
         $zip->close();
         //delete the temporarily created file
         @unlink("$file_name");
-
         return $summary;
     }

diff --git a/tests/epub_processor_test.php b/tests/epub_processor_test.php
deleted file mode 100644
index 43a8736ef..000000000
--- a/tests/epub_processor_test.php
+++ /dev/null
@@ -1,130 +0,0 @@
-<?php
-/**
- *  SeekQuarry/Yioop --
- *  Open Source Pure PHP Search Engine, Crawler, and Indexer
- *
- *  Copyright (C) 2009, 2010, 2011  Chris Pollett chris@pollett.org
- *
- *  LICENSE:
- *
- *  This program is free software: you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation, either version 3 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- *  END LICENSE
- *
- * @author Vijeth Patil vijeth.patil@gmail.com
- * @package seek_quarry
- * @subpackage test
- * @license http://www.gnu.org/licenses/ GPL3
- * @link http://www.seekquarry.com/
- * @copyright 2009, 2010, 2011
- * @filesource
- */
-
-if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
-
-/**
- *  Load search engine-wide configuration file
- */
-require_once BASE_DIR.'/configs/config.php';
-
-/**
- *  Load the EpubProcessor class we are going to test
- */
-require_once BASE_DIR."/lib/processors/epub_processor.php";
-
-/**
- *  Load the base unit test class
- */
-require_once BASE_DIR."/lib/unit_test.php";
-
-/**
- * Load the Crawl constants required for the summary
- */
- require_once BASE_DIR."/lib/crawl_constants.php";
-/**
- * UnitTest for the EpubProcessor class. An EpubProcessor is used to process
- * a .epub (ebook publishing standard) file and extract summary from it. This
- * class tests the processing of an .epub file format by EpubProcessor.
- *
- *
- * @author Vijeth Patil
- * @package seek_quarry
- * @subpackage test
- */
-
-class EpubProcessorTest extends UnitTest implements CrawlConstants
-{
-    /**
-     * Creates a new EpubProcessor object so that
-     * we can process an .epub format file.
-     */
-
-    public function setUp()
-    {
-        $epub_object = new EpubProcessor;
-        $url = "http://www.manybooks.net/titles/aabergj2966629666-8.html";
-        $filename= BASE_DIR."/tests/test_files/aabergj29666296668epub.epub";
-        $page = file_get_contents($filename);
-        $summary=$epub_object->process($page,$url);
-        $this->test_objects['summary'] = $summary;
-
-    }
-
-    /**
-     * Delete any files associated with our test on EpubProcessor
-     */
-    public function tearDown()
-    {
-        @unlink("");
-    }
-
-    /**
-     * Test case to check whether the title of the epub document
-     * is retrieved correctly.
-     */
-    public function testEpubTitleTestCase()
-    {
-        $m = $this->test_objects['summary'] ;
-        $x = $m[self::TITLE];
-        $correct_title = "Hymns and Hymnwriters of Denmark";
-        $description = "Test Passed with correct title";
-        $this->assertEqual($x,$correct_title,$description);
-    }
-
-    /**
-     * Test case to check whether the language of the document is
-     * retrieved correctly.
-     */
-    public function testEpubLangTestCase()
-    {
-        $m = $this->test_objects['summary'] ;
-        $x = $m[self::LANG];
-        $correct_language = "en";
-        $description = "Test Passed with correct Language";
-        $this->assertEqual($x,$correct_language,$description);
-    }
-
-    /**
-     * Test case to check whether the description of the document is
-     * not empty.
-     */
-    public function testEpubDescriptionTestCase()
-    {
-        $m = $this->test_objects['summary'] ;
-        $x = $m[self::DESCRIPTION];
-        $description = "Test Passed with Description information not empty";
-        $this->assertTrue($x, $description);
-    }
-
- }
diff --git a/tests/test_files/aabergj29666296668epub.epub b/tests/test_files/aabergj29666296668epub.epub
deleted file mode 100644
index 2418f49a0..000000000
Binary files a/tests/test_files/aabergj29666296668epub.epub and /dev/null differ

ViewGit