diff --git a/configs/config.php b/configs/config.php index 6d40983ed..03762df8f 100755 --- a/configs/config.php +++ b/configs/config.php @@ -228,6 +228,7 @@ define('NORMALIZE_FREQUENCY', 10000); $INDEXED_FILE_TYPES = array( "asp", + "bmp", "cgi", "cfm", "cfml", diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php index 1e16a1743..a3327fcea 100644 --- a/lib/index_archive_bundle.php +++ b/lib/index_archive_bundle.php @@ -374,7 +374,7 @@ class IndexArchiveBundle implements CrawlConstants */ function forceSave() { - $this->getActiveShard()->save(); + $this->getActiveShard()->save(false, true); } diff --git a/lib/index_shard.php b/lib/index_shard.php index b69466f54..2d0170235 100644 --- a/lib/index_shard.php +++ b/lib/index_shard.php @@ -1086,14 +1086,22 @@ class IndexShard extends PersistentStructure implements * * @param bool $to_string whether output should be written to a string * rather than the default file location + * @param bool $with_logging whether log messages should be written + * as the shard save progresses * @return string serialized shard if output was to string else empty * string */ - public function save($to_string = false) + public function save($to_string = false, $with_logging = false) { $out = ""; $this->mergeWordPostingsToString(); + if($with_logging) { + crawlLog("Saving index shard .. done merge postings to string"); + } $this->prepareWordsAndPrefixes(); + if($with_logging) { + crawlLog("Saving index shard .. make prefixes"); + } $header = pack("N", $this->prefixes_len) . pack("N", $this->words_len) . pack("N", $this->word_docs_len) . @@ -1104,6 +1112,9 @@ class IndexShard extends PersistentStructure implements pack("N", $this->num_link_docs) . pack("N", $this->len_all_docs) . pack("N", $this->len_all_link_docs); + if($with_logging) { + crawlLog("Saving index shard .. packed header"); + } if($to_string) { $out = $header; $this->packWords(NULL); @@ -1116,10 +1127,16 @@ class IndexShard extends PersistentStructure implements fwrite($fh, $header); fwrite($fh, $this->prefixes); $this->packWords($fh); + if($with_logging) { + crawlLog("Saving index shard .. wrote dictionary"); + } $this->outputPostingLists($fh); fwrite($fh, $this->doc_infos); fclose($fh); } + if($with_logging) { + crawlLog("Saving index shard .. done"); + } // clean up by returning to state where could add more docs $this->words = array(); $this->word_docs = ""; diff --git a/lib/indexing_plugins/indexing_plugin.php b/lib/indexing_plugins/indexing_plugin.php index f112f35d7..230b644eb 100644 --- a/lib/indexing_plugins/indexing_plugin.php +++ b/lib/indexing_plugins/indexing_plugin.php @@ -24,6 +24,7 @@ * * @author Priya Gangaraju priya.gangaraju@gmail.com, Chris Pollett * @package seek_quarry + * @subpackage indexing_plugin * @license http://www.gnu.org/licenses/ GPL3 * @link http://www.seekquarry.com/ * @copyright 2011 @@ -53,7 +54,7 @@ require_once BASE_DIR."/models/datasources/".DBMS."_manager.php"; * * @author Priya Gangaraju, Chris Pollett * @package seek_quarry - * @subpackage component + * @subpackage indexing_plugin */ abstract class IndexingPlugin { diff --git a/lib/indexing_plugins/recipe_plugin.php b/lib/indexing_plugins/recipe_plugin.php index add69b0b4..d664d0c30 100644 --- a/lib/indexing_plugins/recipe_plugin.php +++ b/lib/indexing_plugins/recipe_plugin.php @@ -25,7 +25,7 @@ * * @author Priya Gangaraju priya.gangaraju@gmail.com * @package seek_quarry - * @subpackage component + * @subpackage indexing_plugin * @license http://www.gnu.org/licenses/ GPL3 * @link http://www.seekquarry.com/ * @copyright 2011 @@ -77,9 +77,8 @@ require_once BASE_DIR."/lib/crawl_constants.php"; * * @author Priya Gangaraju, Chris Pollett (reorganized and added documentation) * @package seek_quarry - * @subpackage component + * @subpackage indexing_plugin */ - class RecipePlugin extends IndexingPlugin implements CrawlConstants { @@ -495,6 +494,8 @@ if(!function_exists("getLocaleTag")) { /** * class to define vertex + * @package seek_quarry + * @subpackage indexing_plugin */ class Vertex { @@ -520,6 +521,8 @@ class Vertex } /** * class to define edge + * @package seek_quarry + * @subpackage indexing_plugin */ class Edge { @@ -554,6 +557,8 @@ class Edge * the minimum spanning tree using heap. formCluster forms clusters by * deleting the most expensive edge. BreadthFirstSearch is used to * traverse the MST. + * @package seek_quarry + * @subpackage indexing_plugin */ class Tree { @@ -703,6 +708,8 @@ class Tree } /** * heap to maintain the MST + * @package seek_quarry + * @subpackage indexing_plugin */ class Cluster extends SplHeap { @@ -717,6 +724,8 @@ class Cluster extends SplHeap } /** * heap to maintain the tree + * @package seek_quarry + * @subpackage indexing_plugin */ class TreeCluster extends SplHeap { @@ -732,6 +741,8 @@ class TreeCluster extends SplHeap /** * queue for the BFS traversal + * @package seek_quarry + * @subpackage indexing_plugin */ class Queue { diff --git a/lib/processors/bmp_processor.php b/lib/processors/bmp_processor.php index ac14553c0..5e3ac9888 100644 --- a/lib/processors/bmp_processor.php +++ b/lib/processors/bmp_processor.php @@ -63,6 +63,11 @@ class BmpProcessor extends ImageProcessor * Size in bytes of BMP header */ const BMP_HEADER_LEN = 108; + /** + * Maximum pixel width or height + */ + const MAX_DIM = 1000; + /** * {@inheritdoc} */ @@ -70,7 +75,7 @@ class BmpProcessor extends ImageProcessor { if(is_string($page)) { file_put_contents(CRAWL_DIR."/cache/tmp.bmp", $page); - $image = @imagecreatefrombmp(CRAWL_DIR."/cache/tmp.bmp"); + $image = $this->imagecreatefrombmp(CRAWL_DIR."/cache/tmp.bmp"); $thumb_string = self::createThumb($image); $summary[self::TITLE] = ""; $summary[self::DESCRIPTION] = "Image of ". @@ -108,7 +113,7 @@ class BmpProcessor extends ImageProcessor $header = substr($hex, 0, self::BMP_HEADER_LEN); - $can_understand_flag= substr($header, 0, 4) == "424d"; + $can_understand_flag = substr($header, 0, 4) == "424d"; // get parameters of image from header bytes if ($can_understand_flag) { $header_parts = str_split($header, 2); @@ -116,7 +121,8 @@ class BmpProcessor extends ImageProcessor $height = hexdec($header_parts[23] . $header_parts[22]); $bits_per_pixel = hexdec($header_parts[29] . $header_parts[28]); $can_understand_flag = (($bits_per_pixel == 24) || - ($bits_per_pixel == 32)); + ($bits_per_pixel == 32)) && ($width < + self::MAX_DIM && $height < self::MAX_DIM ); unset($header_parts); } @@ -139,7 +145,6 @@ class BmpProcessor extends ImageProcessor $body_size = strlen($body)/2; $header_size = ($width * $height); - // Set-up padding flag // Set-up padding flag $padding_flag = ($body_size > ($header_size * 3) + 4); diff --git a/lib/processors/jpg_processor.php b/lib/processors/jpg_processor.php index a9979e131..f15a50ec6 100755 --- a/lib/processors/jpg_processor.php +++ b/lib/processors/jpg_processor.php @@ -33,6 +33,8 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} +ini_set("gd.jpeg_ignore_warning", 1); + /** Used for the getDocumentFilename method in UrlParser */ require_once BASE_DIR."/lib/url_parser.php"; /** Load base class, if needed */