diff --git a/index.php b/index.php index 03e04e436..5e3705911 100755 --- a/index.php +++ b/index.php @@ -45,7 +45,7 @@ define("BASE_DIR", substr($_SERVER['SCRIPT_FILENAME'], 0,-strlen("index.php"))); */ require_once(BASE_DIR.'configs/config.php'); ini_set("memory_limit","500M"); -header("X-FRAME-OPTIONS: DENY"); //prevent click jacking +header("X-FRAME-OPTIONS: DENY"); //prevent click-jacking session_name(SESSION_NAME); session_start(); /** diff --git a/lib/archive_bundle_iterators/web_archive_bundle_iterator.php b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php index 04124505b..0712f0a4c 100644 --- a/lib/archive_bundle_iterators/web_archive_bundle_iterator.php +++ b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php @@ -190,7 +190,7 @@ class WebArchiveBundleIterator implements CrawlConstants function reset() { $this->count = $this->archive->count; - $this->num_partitions = $this->archive->write_partition+1; + $this->num_partitions = $this->archive->write_partition + 1; $this->overall_index = 0; $this->end_of_iterator = ($this->overall_index >= $this->count) ? true : false; diff --git a/lib/processors/epub_processor.php b/lib/processors/epub_processor.php index 17ea378c4..966dcddd2 100644 --- a/lib/processors/epub_processor.php +++ b/lib/processors/epub_processor.php @@ -123,15 +123,15 @@ class EpubProcessor extends TextProcessor */ function process($page, $url) { - $summary = NULL; + $summary = NULL; $opf_pattern = "/.opf$/i"; $html_pattern = "/.html$/i"; $xhtml_pattern = "/.xhtml$/i"; $temp_filename = "epubzipfilename.zip"; - $epub_url = 0; + $epub_url = 0; $epub_language = ''; file_put_contents($temp_filename,$page); - $zip = new ZipArchive; + $zip = new ZipArchive; if ($zip->open($temp_filename)) { for($i = 0; $i < $zip->numFiles; $i++) @@ -143,13 +143,16 @@ class EpubProcessor extends TextProcessor // Get the file data from zipped folder $opf_data = $zip->getFromName($filename[$i]); $opf_summary = $this->xmlToObject($opf_data); - for($m = 0;$m <= 10;$m++) + for($m = 0;$m <= 10; $m++) { - for($n = 0;$n <= 10;$n++) - { - if(($opf_summary->children[$m]->children[$n]-> - name) == "dc:language") - { + if(!isset($opf_summary->children[$m])) continue; + for($n = 0;$n <= 10; $n++) { + if(!isset( + $opf_summary->children[$m]->children[$n])) + continue; + $child = $opf_summary->children[$m]->children[$n]; + if( isset($child->name) && + $child->name == "dc:language") { $epub_language = $opf_summary->children[$m]-> children[$n]->content ; } @@ -183,11 +186,11 @@ class EpubProcessor extends TextProcessor } } $summary[self::TITLE] = $epub_title; - $summary[self::DESCRIPTION] = $description; + $summary[self::DESCRIPTION] = $description; $summary[self::LANG] = $epub_language; $summary[self::LINKS] = $epub_url; $summary[self::PAGE] = $page; - unlink($temp_filename); + unlink($temp_filename); return $summary; } @@ -195,7 +198,7 @@ class EpubProcessor extends TextProcessor * Used to extract the title, author, language and links from * a string consisting of ebook publication data. * - * @param string $page xml contents + * @param string $page xml contents * * @return array an information about the contents of the page * diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index 9e052426d..27dc5f078 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -259,6 +259,12 @@ class HtmlProcessor extends TextProcessor $sites = array(); $xpath = new DOMXPath($dom); + $base_refs = $xpath->evaluate("/html//base"); + if($base_refs->item(0)) { + $tmp_site = $base_refs->item(0)->getAttribute('href'); + if(strlen($tmp_site) > 0) {$site = $tmp_site;} + } + $hrefs = $xpath->evaluate("/html/body//a"); $i = 0; @@ -326,7 +332,6 @@ class HtmlProcessor extends TextProcessor } - return $sites; } diff --git a/lib/processors/pptx_processor.php b/lib/processors/pptx_processor.php index ad430f6fc..926eb6b1a 100644 --- a/lib/processors/pptx_processor.php +++ b/lib/processors/pptx_processor.php @@ -80,7 +80,7 @@ class PptxProcessor extends TextProcessor $zip = new ZipArchive; if ($zip->open($filename) === TRUE) { $buf= $zip->getFromName("docProps/core.xml"); - if ($buf){ + if ($buf) { $dom = self::dom($buf); if($dom !== false) { // Get the title @@ -100,7 +100,7 @@ class PptxProcessor extends TextProcessor $buf=$zip->getFromName("ppt/slides/slide" . $i . ".xml"); if($buf){ /* Get description , language and url links asociated - * with each slide*/ + with each slide*/ $dom = self::dom($buf); $desc=self::description($dom); @@ -152,15 +152,15 @@ class PptxProcessor extends TextProcessor foreach($paras as $para) { if($i < MAX_LINKS_PER_PAGE) { - $hlink=$para->parentNode->parentNode-> + $hlink = $para->parentNode->parentNode-> getElementsByTagName("t")->item(0)->nodeValue; - $url=UrlParser::canonicalLink( + $url = UrlParser::canonicalLink( $hlink, $site); if(!UrlParser::checkRecursiveUrl($url) && strlen($url) < MAX_URL_LENGTH) { if(isset($sites[$url])) { - $sites[$url] .=" ".$hlink; + $sites[$url] .= " ".$hlink; } else { $sites[$url] = $hlink; } @@ -262,4 +262,4 @@ class PptxProcessor extends TextProcessor } } -?> \ No newline at end of file +?> diff --git a/lib/processors/sitemap_processor.php b/lib/processors/sitemap_processor.php index f5d35f100..d52305ff1 100644 --- a/lib/processors/sitemap_processor.php +++ b/lib/processors/sitemap_processor.php @@ -44,7 +44,7 @@ require_once BASE_DIR."/lib/url_parser.php"; /** * Used to create crawl summary information - * for RSS files + * for sitemap files * * @author Chris Pollett * @package seek_quarry diff --git a/models/locale_model.php b/models/locale_model.php index ac9b49ddf..bc56b18f6 100644 --- a/models/locale_model.php +++ b/models/locale_model.php @@ -682,11 +682,9 @@ EOT; } } } - - return $strings; closedir($dh); + return $strings; - return; } } ?> diff --git a/tests/epub_processor_test.php b/tests/epub_processor_test.php index 84dc12ce2..43a8736ef 100644 --- a/tests/epub_processor_test.php +++ b/tests/epub_processor_test.php @@ -78,9 +78,7 @@ class EpubProcessorTest extends UnitTest implements CrawlConstants $page = file_get_contents($filename); $summary=$epub_object->process($page,$url); $this->test_objects['summary'] = $summary; - $this->testEpubTitleTestCase(); - $this->testEpubLangTestCase(); - $this->testEpubDescriptionTestCase(); + } /** @@ -114,7 +112,7 @@ class EpubProcessorTest extends UnitTest implements CrawlConstants $x = $m[self::LANG]; $correct_language = "en"; $description = "Test Passed with correct Language"; - $this->assertEqual($x,$correct_language,$description); + $this->assertEqual($x,$correct_language,$description); } /** @@ -126,8 +124,7 @@ class EpubProcessorTest extends UnitTest implements CrawlConstants $m = $this->test_objects['summary'] ; $x = $m[self::DESCRIPTION]; $description = "Test Passed with Description information not empty"; - $this->assertTrue($x,$description); + $this->assertTrue($x, $description); } - - + }