Further BMP fixes, add more logging to index shard, a=chris

Chris Pollett [2011-07-30, 16h]
Further BMP fixes, add more logging to index shard, a=chris
Filename
configs/config.php
lib/index_archive_bundle.php
lib/index_shard.php
lib/indexing_plugins/indexing_plugin.php
lib/indexing_plugins/recipe_plugin.php
lib/processors/bmp_processor.php
lib/processors/jpg_processor.php
diff --git a/configs/config.php b/configs/config.php
index 6d40983ed..03762df8f 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -228,6 +228,7 @@ define('NORMALIZE_FREQUENCY', 10000);
 $INDEXED_FILE_TYPES =
     array(
             "asp",
+            "bmp",
             "cgi",
             "cfm",
             "cfml",
diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php
index 1e16a1743..a3327fcea 100644
--- a/lib/index_archive_bundle.php
+++ b/lib/index_archive_bundle.php
@@ -374,7 +374,7 @@ class IndexArchiveBundle implements CrawlConstants
      */
     function forceSave()
     {
-        $this->getActiveShard()->save();
+        $this->getActiveShard()->save(false, true);
     }


diff --git a/lib/index_shard.php b/lib/index_shard.php
index b69466f54..2d0170235 100644
--- a/lib/index_shard.php
+++ b/lib/index_shard.php
@@ -1086,14 +1086,22 @@ class IndexShard extends PersistentStructure implements
      *
      *  @param bool $to_string whether output should be written to a string
      *      rather than the default file location
+     *  @param bool $with_logging whether log messages should be written
+     *      as the shard save progresses
      *  @return string serialized shard if output was to string else empty
      *      string
      */
-    public function save($to_string = false)
+    public function save($to_string = false, $with_logging = false)
     {
         $out = "";
         $this->mergeWordPostingsToString();
+        if($with_logging) {
+            crawlLog("Saving index shard .. done merge postings to string");
+        }
         $this->prepareWordsAndPrefixes();
+        if($with_logging) {
+            crawlLog("Saving index shard .. make prefixes");
+        }
         $header =  pack("N", $this->prefixes_len) .
             pack("N", $this->words_len) .
             pack("N", $this->word_docs_len) .
@@ -1104,6 +1112,9 @@ class IndexShard extends PersistentStructure implements
             pack("N", $this->num_link_docs) .
             pack("N", $this->len_all_docs) .
             pack("N", $this->len_all_link_docs);
+        if($with_logging) {
+            crawlLog("Saving index shard .. packed header");
+        }
         if($to_string) {
             $out = $header;
             $this->packWords(NULL);
@@ -1116,10 +1127,16 @@ class IndexShard extends PersistentStructure implements
             fwrite($fh, $header);
             fwrite($fh, $this->prefixes);
             $this->packWords($fh);
+            if($with_logging) {
+                crawlLog("Saving index shard .. wrote dictionary");
+            }
             $this->outputPostingLists($fh);
             fwrite($fh, $this->doc_infos);
             fclose($fh);
         }
+        if($with_logging) {
+            crawlLog("Saving index shard .. done");
+        }
         // clean up by returning to state where could add more docs
         $this->words = array();
         $this->word_docs = "";
diff --git a/lib/indexing_plugins/indexing_plugin.php b/lib/indexing_plugins/indexing_plugin.php
index f112f35d7..230b644eb 100644
--- a/lib/indexing_plugins/indexing_plugin.php
+++ b/lib/indexing_plugins/indexing_plugin.php
@@ -24,6 +24,7 @@
  *
  * @author Priya Gangaraju priya.gangaraju@gmail.com, Chris Pollett
  * @package seek_quarry
+ * @subpackage indexing_plugin
  * @license http://www.gnu.org/licenses/ GPL3
  * @link http://www.seekquarry.com/
  * @copyright 2011
@@ -53,7 +54,7 @@ require_once BASE_DIR."/models/datasources/".DBMS."_manager.php";
  *
  * @author Priya Gangaraju, Chris Pollett
  * @package seek_quarry
- * @subpackage component
+ * @subpackage indexing_plugin
  */
 abstract class IndexingPlugin
 {
diff --git a/lib/indexing_plugins/recipe_plugin.php b/lib/indexing_plugins/recipe_plugin.php
index add69b0b4..d664d0c30 100644
--- a/lib/indexing_plugins/recipe_plugin.php
+++ b/lib/indexing_plugins/recipe_plugin.php
@@ -25,7 +25,7 @@
  *
  * @author Priya Gangaraju priya.gangaraju@gmail.com
  * @package seek_quarry
- * @subpackage component
+ * @subpackage indexing_plugin
  * @license http://www.gnu.org/licenses/ GPL3
  * @link http://www.seekquarry.com/
  * @copyright 2011
@@ -77,9 +77,8 @@ require_once BASE_DIR."/lib/crawl_constants.php";
  *
  * @author Priya Gangaraju, Chris Pollett (reorganized and added documentation)
  * @package seek_quarry
- * @subpackage component
+ * @subpackage indexing_plugin
  */
-
 class RecipePlugin extends IndexingPlugin implements CrawlConstants
 {

@@ -495,6 +494,8 @@ if(!function_exists("getLocaleTag")) {

 /**
  * class to define vertex
+ * @package seek_quarry
+ * @subpackage indexing_plugin
  */
 class Vertex
 {
@@ -520,6 +521,8 @@ class Vertex
 }
 /**
  * class to define edge
+ * @package seek_quarry
+ * @subpackage indexing_plugin
  */
 class Edge
 {
@@ -554,6 +557,8 @@ class Edge
  * the minimum spanning tree using heap. formCluster forms clusters by
  * deleting the most expensive edge. BreadthFirstSearch is used to
  * traverse the MST.
+ * @package seek_quarry
+ * @subpackage indexing_plugin
  */
 class Tree
 {
@@ -703,6 +708,8 @@ class Tree
 }
 /**
  * heap to maintain the MST
+ * @package seek_quarry
+ * @subpackage indexing_plugin
  */
 class Cluster extends SplHeap
 {
@@ -717,6 +724,8 @@ class Cluster extends SplHeap
 }
 /**
  * heap to maintain the tree
+ * @package seek_quarry
+ * @subpackage indexing_plugin
  */
 class TreeCluster extends SplHeap
 {
@@ -732,6 +741,8 @@ class TreeCluster extends SplHeap

 /**
  * queue for the BFS traversal
+ * @package seek_quarry
+ * @subpackage indexing_plugin
  */
 class Queue
 {
diff --git a/lib/processors/bmp_processor.php b/lib/processors/bmp_processor.php
index ac14553c0..5e3ac9888 100644
--- a/lib/processors/bmp_processor.php
+++ b/lib/processors/bmp_processor.php
@@ -63,6 +63,11 @@ class BmpProcessor extends ImageProcessor
      * Size in bytes of BMP header
      */
     const BMP_HEADER_LEN = 108;
+    /**
+     * Maximum pixel width or height
+     */
+    const MAX_DIM = 1000;
+
     /**
      * {@inheritdoc}
      */
@@ -70,7 +75,7 @@ class BmpProcessor extends ImageProcessor
     {
         if(is_string($page)) {
             file_put_contents(CRAWL_DIR."/cache/tmp.bmp", $page);
-            $image = @imagecreatefrombmp(CRAWL_DIR."/cache/tmp.bmp");
+            $image = $this->imagecreatefrombmp(CRAWL_DIR."/cache/tmp.bmp");
             $thumb_string = self::createThumb($image);
             $summary[self::TITLE] = "";
             $summary[self::DESCRIPTION] = "Image of ".
@@ -108,7 +113,7 @@ class BmpProcessor extends ImageProcessor
         $header = substr($hex, 0, self::BMP_HEADER_LEN);


-        $can_understand_flag= substr($header, 0, 4) == "424d";
+        $can_understand_flag = substr($header, 0, 4) == "424d";
         // get parameters of image from header bytes
         if ($can_understand_flag) {
             $header_parts = str_split($header, 2);
@@ -116,7 +121,8 @@ class BmpProcessor extends ImageProcessor
             $height = hexdec($header_parts[23] . $header_parts[22]);
             $bits_per_pixel = hexdec($header_parts[29] . $header_parts[28]);
             $can_understand_flag = (($bits_per_pixel == 24) ||
-                ($bits_per_pixel == 32));
+                ($bits_per_pixel == 32)) && ($width <
+                self::MAX_DIM && $height < self::MAX_DIM );
             unset($header_parts);
         }

@@ -139,7 +145,6 @@ class BmpProcessor extends ImageProcessor
         $body_size = strlen($body)/2;
         $header_size = ($width * $height);

-        // Set-up padding flag
         // Set-up padding flag
         $padding_flag = ($body_size > ($header_size * 3) + 4);

diff --git a/lib/processors/jpg_processor.php b/lib/processors/jpg_processor.php
index a9979e131..f15a50ec6 100755
--- a/lib/processors/jpg_processor.php
+++ b/lib/processors/jpg_processor.php
@@ -33,6 +33,8 @@

 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

+ini_set("gd.jpeg_ignore_warning", 1);
+
 /** Used for the getDocumentFilename method in UrlParser */
 require_once BASE_DIR."/lib/url_parser.php";
 /** Load base class, if needed */
ViewGit