diff --git a/configs/config.php b/configs/config.php index e3bfc6319..3d9d2620d 100755 --- a/configs/config.php +++ b/configs/config.php @@ -235,7 +235,6 @@ $INDEXED_FILE_TYPES = "cfml", "csv", "doc", - "epub", "gif", "html", "htm", diff --git a/lib/processors/epub_processor.php b/lib/processors/epub_processor.php deleted file mode 100644 index ce7986796..000000000 --- a/lib/processors/epub_processor.php +++ /dev/null @@ -1,230 +0,0 @@ -<?php -/** - * SeekQuarry/Yioop -- - * Open Source Pure PHP Search Engine, Crawler, and Indexer - * - * Copyright (C) 2009, 2010, 2011 Chris Pollett chris@pollett.org - * - * LICENSE: - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * END LICENSE - * - * @author Vijeth Patil vijeth.patil@gmail.com - * @package seek_quarry - * @subpackage processor - * @license http://www.gnu.org/licenses/ GPL3 - * @link http://www.seekquarry.com/ - * @copyright 2009, 2010, 2011 - * @filesource - */ - -if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} - -/** - * Load base class, if needed. - */ -require_once BASE_DIR."/lib/processors/text_processor.php"; - -/** - * If XML turns out to be XHTML ... - */ -require_once BASE_DIR."/lib/processors/html_processor.php"; - -/** - * Load so can parse urls - */ -require_once BASE_DIR."/lib/url_parser.php"; - - /** - * Used to create crawl summary information - * for XML files (those served as application/epub+zip) - * - * @author Vijeth Patil - * @package seek_quarry - * @subpackage processor - */ - -class EpubProcessor extends TextProcessor -{ - /** - * The name of the tag element in an xml document - * - * @var string name - */ - var $name; - - /** - * The attribute of the tag element in an xml document - * - * @var string attributes - */ - var $attributes; - - /** - * The content of the tag element or attribute, used to extract - * the fields like title, creator, language of the document - * - * @var string content - */ - var $content; - - /** - * The child tag element of a tag element. - * - * @var string children - */ - var $children; - - /** - * The maximum length of description - * - * @const integer MAX_DESCRIPTION_LEN - */ - const MAX_DESCRIPTION_LEN = 2000; - /** - * The processor will get the first this many files found in - * an .odf file and get the first this many elements from - * each of those files - * - * @const integer MAX_DOM_LEVEL - */ - const MAX_DOM_LEVEL = 10; - /** - * Used to extract the title, description and links from - * a string consisting of ebook publication data. - * - * @param string $page epub contents - * @param string $url the url where the page contents came from, - * used to canonicalize relative links - * - * @return array a summary of the contents of the page - * - */ - function process($page, $url) - { - $summary = NULL; - $opf_pattern = "/.opf$/i"; - $html_pattern = "/.html$/i"; - $xhtml_pattern = "/.xhtml$/i"; - $temp_filename = "epubzipfilename.zip"; - $epub_url = 0; - $epub_language = ''; - $epub_title = ''; - $epub_unique_identifier = ''; - $epub_author = ''; - - file_put_contents($temp_filename, $page); - $zip = new ZipArchive; - if($zip->open($temp_filename)) { - for($i = 0; $i < $zip->numFiles; $i++) { - // get the content file names of .epub document - $filename[$i] = $zip->getNameIndex($i) ; - if(preg_match($opf_pattern, $filename[$i])) { - // Get the file data from zipped folder - $opf_data = $zip->getFromName($filename[$i]); - $opf_summary = $this->xmlToObject($opf_data); - for($m = 0; $m <= self::MAX_DOM_LEVEL; $m++) { - for($n = 0;$n <= self::MAX_DOM_LEVEL; $n++) { - if(isset($opf_summary->children[$m]->children[$n])){ - $child = $opf_summary->children[$m]-> - children[$n]; - if( isset($child->name) && - $child->name == "dc:language") { - $epub_language = - $opf_summary->children[$m]-> - children[$n]->content ; - } - if( ($opf_summary->children[$m]->children[$n]-> - name) == "dc:title") { - $epub_title = $opf_summary->children[$m]-> - children[$n]->content; - } - if( ($opf_summary->children[$m]->children[$n]-> - name) == "dc:creator") { - $epub_author = $opf_summary->children[$m]-> - children[$n]->content ; - } - if( ($opf_summary->children[$m]->children[$n]-> - name) == "dc:identifier") { - $epub_unique_identifier = $opf_summary-> - children[$m]->children[$n]->content ; - } - } - } - } - }else if((preg_match($html_pattern,$filename[$i])) || - (preg_match($xhtml_pattern,$filename[$i]))) { - $html = new HtmlProcessor; - $html_data = $zip->getFromName($filename[$i]); - $description[$i] = $html->process($html_data,$url); - } - } - } - $summary[self::TITLE] = $epub_title; - $summary[self::DESCRIPTION] = $description; - $summary[self::LANG] = $epub_language; - $summary[self::LINKS] = $epub_url; - $summary[self::PAGE] = $page; - unlink($temp_filename); - return $summary; - } - - /** - * Used to extract the DOM tree containing the information - * about the epub file such as title, author, language, unique - * identifier of the book from a string consisting of ebook publication - * content OPF file. - * - * @param string $page xml contents - * - * @return array an information about the contents of the page - * - */ - function xmlToObject($xml) - { - $parser = xml_parser_create(); - xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0); - xml_parser_set_option($parser, XML_OPTION_SKIP_WHITE, 1); - xml_parse_into_struct($parser, $xml, $tags); - xml_parser_free($parser); - - $elements = array(); // the currently filling [child] XmlElement array - $stack = array(); - foreach ($tags as $tag) { - $index = count($elements); - if ($tag['type'] == "complete" || $tag['type'] == "open") { - $elements[$index] = new EpubProcessor; - $elements[$index]->name = $tag['tag']; - if(isset($tag['attributes'])) { - $elements[$index]->attributes = $tag['attributes']; - } - if(isset($tag['value'])) { - $elements[$index]->content = $tag['value']; - } - if ($tag['type'] == "open") { // push - $elements[$index]->children = array(); - $stack[] = &$elements; - $elements = &$elements[$index]->children; - } - } - if ($tag['type'] == "close") { // pop - $elements = array_pop($stack); - } - } - return $elements[0]; // the single top-level element - } -} -?> diff --git a/lib/processors/pptx_processor.php b/lib/processors/pptx_processor.php index 15a4b163a..a71186179 100644 --- a/lib/processors/pptx_processor.php +++ b/lib/processors/pptx_processor.php @@ -131,7 +131,6 @@ class PptxProcessor extends TextProcessor // If not pptx then process it as a text file $summary = parent::process($page, $url); } - return $summary; } diff --git a/lib/processors/xlsx_processor.php b/lib/processors/xlsx_processor.php index b156834e4..24c63fdf0 100644 --- a/lib/processors/xlsx_processor.php +++ b/lib/processors/xlsx_processor.php @@ -132,7 +132,6 @@ class XlsxProcessor extends TextProcessor $zip->close(); //delete the temporarily created file @unlink("$file_name"); - return $summary; } diff --git a/tests/epub_processor_test.php b/tests/epub_processor_test.php deleted file mode 100644 index 43a8736ef..000000000 --- a/tests/epub_processor_test.php +++ /dev/null @@ -1,130 +0,0 @@ -<?php -/** - * SeekQuarry/Yioop -- - * Open Source Pure PHP Search Engine, Crawler, and Indexer - * - * Copyright (C) 2009, 2010, 2011 Chris Pollett chris@pollett.org - * - * LICENSE: - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * END LICENSE - * - * @author Vijeth Patil vijeth.patil@gmail.com - * @package seek_quarry - * @subpackage test - * @license http://www.gnu.org/licenses/ GPL3 - * @link http://www.seekquarry.com/ - * @copyright 2009, 2010, 2011 - * @filesource - */ - -if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} - -/** - * Load search engine-wide configuration file - */ -require_once BASE_DIR.'/configs/config.php'; - -/** - * Load the EpubProcessor class we are going to test - */ -require_once BASE_DIR."/lib/processors/epub_processor.php"; - -/** - * Load the base unit test class - */ -require_once BASE_DIR."/lib/unit_test.php"; - -/** - * Load the Crawl constants required for the summary - */ - require_once BASE_DIR."/lib/crawl_constants.php"; -/** - * UnitTest for the EpubProcessor class. An EpubProcessor is used to process - * a .epub (ebook publishing standard) file and extract summary from it. This - * class tests the processing of an .epub file format by EpubProcessor. - * - * - * @author Vijeth Patil - * @package seek_quarry - * @subpackage test - */ - -class EpubProcessorTest extends UnitTest implements CrawlConstants -{ - /** - * Creates a new EpubProcessor object so that - * we can process an .epub format file. - */ - - public function setUp() - { - $epub_object = new EpubProcessor; - $url = "http://www.manybooks.net/titles/aabergj2966629666-8.html"; - $filename= BASE_DIR."/tests/test_files/aabergj29666296668epub.epub"; - $page = file_get_contents($filename); - $summary=$epub_object->process($page,$url); - $this->test_objects['summary'] = $summary; - - } - - /** - * Delete any files associated with our test on EpubProcessor - */ - public function tearDown() - { - @unlink(""); - } - - /** - * Test case to check whether the title of the epub document - * is retrieved correctly. - */ - public function testEpubTitleTestCase() - { - $m = $this->test_objects['summary'] ; - $x = $m[self::TITLE]; - $correct_title = "Hymns and Hymnwriters of Denmark"; - $description = "Test Passed with correct title"; - $this->assertEqual($x,$correct_title,$description); - } - - /** - * Test case to check whether the language of the document is - * retrieved correctly. - */ - public function testEpubLangTestCase() - { - $m = $this->test_objects['summary'] ; - $x = $m[self::LANG]; - $correct_language = "en"; - $description = "Test Passed with correct Language"; - $this->assertEqual($x,$correct_language,$description); - } - - /** - * Test case to check whether the description of the document is - * not empty. - */ - public function testEpubDescriptionTestCase() - { - $m = $this->test_objects['summary'] ; - $x = $m[self::DESCRIPTION]; - $description = "Test Passed with Description information not empty"; - $this->assertTrue($x, $description); - } - - } diff --git a/tests/test_files/aabergj29666296668epub.epub b/tests/test_files/aabergj29666296668epub.epub deleted file mode 100644 index 2418f49a0..000000000 Binary files a/tests/test_files/aabergj29666296668epub.epub and /dev/null differ