Support added for base tag in html processor, a=chris

Chris Pollett [2011-08-11, hour 04 (minutes and seconds lost to a date-format glitch)]

Support added for base tag in html processor, a=chris

Filename
index.php
lib/archive_bundle_iterators/web_archive_bundle_iterator.php
lib/processors/epub_processor.php
lib/processors/html_processor.php
lib/processors/pptx_processor.php
lib/processors/sitemap_processor.php
models/locale_model.php
tests/epub_processor_test.php

diff --git a/index.php b/index.php
index 03e04e436..5e3705911 100755
--- a/index.php
+++ b/index.php
@@ -45,7 +45,7 @@ define("BASE_DIR", substr($_SERVER['SCRIPT_FILENAME'], 0,-strlen("index.php")));
  */
 require_once(BASE_DIR.'configs/config.php');
 ini_set("memory_limit","500M");
-header("X-FRAME-OPTIONS: DENY"); //prevent click jacking
+header("X-FRAME-OPTIONS: DENY"); //prevent click-jacking
 session_name(SESSION_NAME);
 session_start();
 /**
diff --git a/lib/archive_bundle_iterators/web_archive_bundle_iterator.php b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
index 04124505b..0712f0a4c 100644
--- a/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php
@@ -190,7 +190,7 @@ class WebArchiveBundleIterator implements CrawlConstants
     function reset()
     {
         $this->count = $this->archive->count;
-        $this->num_partitions = $this->archive->write_partition+1;
+        $this->num_partitions = $this->archive->write_partition + 1;
         $this->overall_index = 0;
         $this->end_of_iterator = ($this->overall_index >= $this->count) ?
             true : false;
diff --git a/lib/processors/epub_processor.php b/lib/processors/epub_processor.php
index 17ea378c4..966dcddd2 100644
--- a/lib/processors/epub_processor.php
+++ b/lib/processors/epub_processor.php
@@ -123,15 +123,15 @@ class EpubProcessor extends TextProcessor
      */
     function process($page, $url)
     {
-        $summary       = NULL;
+        $summary = NULL;
         $opf_pattern   = "/.opf$/i";
         $html_pattern  = "/.html$/i";
         $xhtml_pattern = "/.xhtml$/i";
         $temp_filename = "epubzipfilename.zip";
-        $epub_url      = 0;
+        $epub_url = 0;
         $epub_language = '';
         file_put_contents($temp_filename,$page);
-        $zip = new ZipArchive;
+        $zip = new ZipArchive;
         if ($zip->open($temp_filename))
         {
             for($i = 0; $i < $zip->numFiles; $i++)
@@ -143,13 +143,16 @@ class EpubProcessor extends TextProcessor
                     // Get the file data from zipped folder
                     $opf_data = $zip->getFromName($filename[$i]);
                     $opf_summary = $this->xmlToObject($opf_data);
-                    for($m = 0;$m <= 10;$m++)
+                    for($m = 0;$m <= 10; $m++)
                     {
-                        for($n = 0;$n <= 10;$n++)
-                        {
-                            if(($opf_summary->children[$m]->children[$n]->
-                                name) == "dc:language")
-                            {
+                        if(!isset($opf_summary->children[$m])) continue;
+                        for($n = 0;$n <= 10; $n++)  {
+                            if(!isset(
+                                $opf_summary->children[$m]->children[$n]))
+                                    continue;
+                            $child = $opf_summary->children[$m]->children[$n];
+                            if( isset($child->name) &&
+                                $child->name == "dc:language") {
                                 $epub_language = $opf_summary->children[$m]->
                                 children[$n]->content ;
                             }
@@ -183,11 +186,11 @@ class EpubProcessor extends TextProcessor
             }
         }
         $summary[self::TITLE] = $epub_title;
-        $summary[self::DESCRIPTION] = $description;
+        $summary[self::DESCRIPTION] = $description;
         $summary[self::LANG] = $epub_language;
         $summary[self::LINKS] = $epub_url;
         $summary[self::PAGE] = $page;
-        unlink($temp_filename);
+        unlink($temp_filename);
         return $summary;
     }

@@ -195,7 +198,7 @@ class EpubProcessor extends TextProcessor
      *  Used to extract the title, author, language and links from
      *  a string consisting of ebook publication data.
      *
-     *  @param string $page xml contents
+     *  @param string $page xml contents
      *
      *  @return array  an information about the contents of the page
      *
diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php
index 9e052426d..27dc5f078 100755
--- a/lib/processors/html_processor.php
+++ b/lib/processors/html_processor.php
@@ -259,6 +259,12 @@ class HtmlProcessor extends TextProcessor
         $sites = array();

         $xpath = new DOMXPath($dom);
+        $base_refs = $xpath->evaluate("/html//base");
+        if($base_refs->item(0)) {
+            $tmp_site = $base_refs->item(0)->getAttribute('href');
+            if(strlen($tmp_site) > 0) {$site = $tmp_site;}
+        }
+
         $hrefs = $xpath->evaluate("/html/body//a");

         $i = 0;
@@ -326,7 +332,6 @@ class HtmlProcessor extends TextProcessor

         }

-
        return $sites;
     }

diff --git a/lib/processors/pptx_processor.php b/lib/processors/pptx_processor.php
index ad430f6fc..926eb6b1a 100644
--- a/lib/processors/pptx_processor.php
+++ b/lib/processors/pptx_processor.php
@@ -80,7 +80,7 @@ class PptxProcessor extends TextProcessor
         $zip = new ZipArchive;
         if ($zip->open($filename) === TRUE) {
             $buf= $zip->getFromName("docProps/core.xml");
-            if ($buf){
+            if ($buf) {
                 $dom = self::dom($buf);
                 if($dom !== false) {
                 // Get the title
@@ -100,7 +100,7 @@ class PptxProcessor extends TextProcessor
                 $buf=$zip->getFromName("ppt/slides/slide" . $i . ".xml");
                 if($buf){
                 /* Get description , language and url links asociated
-                 * with each slide*/
+                   with each slide*/
                     $dom = self::dom($buf);
                     $desc=self::description($dom);

@@ -152,15 +152,15 @@ class PptxProcessor extends TextProcessor

         foreach($paras as $para) {
             if($i < MAX_LINKS_PER_PAGE) {
-                $hlink=$para->parentNode->parentNode->
+                $hlink = $para->parentNode->parentNode->
                     getElementsByTagName("t")->item(0)->nodeValue;

-                $url=UrlParser::canonicalLink(
+                $url = UrlParser::canonicalLink(
                     $hlink, $site);
                 if(!UrlParser::checkRecursiveUrl($url)  &&
                     strlen($url) < MAX_URL_LENGTH) {
                     if(isset($sites[$url])) {
-                        $sites[$url] .=" ".$hlink;
+                        $sites[$url] .= " ".$hlink;
                     } else {
                         $sites[$url] = $hlink;
                     }
@@ -262,4 +262,4 @@ class PptxProcessor extends TextProcessor
     }

 }
-?>
\ No newline at end of file
+?>
diff --git a/lib/processors/sitemap_processor.php b/lib/processors/sitemap_processor.php
index f5d35f100..d52305ff1 100644
--- a/lib/processors/sitemap_processor.php
+++ b/lib/processors/sitemap_processor.php
@@ -44,7 +44,7 @@ require_once BASE_DIR."/lib/url_parser.php";

  /**
  * Used to create crawl summary information
- * for RSS files
+ * for sitemap files
  *
  * @author Chris Pollett
  * @package seek_quarry
diff --git a/models/locale_model.php b/models/locale_model.php
index ac9b49ddf..bc56b18f6 100644
--- a/models/locale_model.php
+++ b/models/locale_model.php
@@ -682,11 +682,9 @@ EOT;
                 }
             }
         }
-
-        return $strings;
         closedir($dh);
+        return $strings;

-        return;
     }
 }
  ?>
diff --git a/tests/epub_processor_test.php b/tests/epub_processor_test.php
index 84dc12ce2..43a8736ef 100644
--- a/tests/epub_processor_test.php
+++ b/tests/epub_processor_test.php
@@ -78,9 +78,7 @@ class EpubProcessorTest extends UnitTest implements CrawlConstants
         $page = file_get_contents($filename);
         $summary=$epub_object->process($page,$url);
         $this->test_objects['summary'] = $summary;
-        $this->testEpubTitleTestCase();
-        $this->testEpubLangTestCase();
-        $this->testEpubDescriptionTestCase();
+
     }

     /**
@@ -114,7 +112,7 @@ class EpubProcessorTest extends UnitTest implements CrawlConstants
         $x = $m[self::LANG];
         $correct_language = "en";
         $description = "Test Passed with correct Language";
-        $this->assertEqual($x,$correct_language,$description);
+        $this->assertEqual($x,$correct_language,$description);
     }

     /**
@@ -126,8 +124,7 @@ class EpubProcessorTest extends UnitTest implements CrawlConstants
         $m = $this->test_objects['summary'] ;
         $x = $m[self::DESCRIPTION];
         $description = "Test Passed with Description information not empty";
-        $this->assertTrue($x,$description);
+        $this->assertTrue($x, $description);
     }
-
-
+
  }

ViewGit