diff --git a/bin/fetcher.php b/bin/fetcher.php index cfb5ba4bb..484b06a84 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -60,15 +60,15 @@ foreach(glob(BASE_DIR."/lib/processors/*_processor.php") as $filename) { /** To support English language stemming of words (jumps, jumping --> jump)*/ require_once BASE_DIR."/lib/porter_stemmer.php"; -/** */ +/** Used to manipulate urls*/ require_once BASE_DIR."/lib/url_parser.php"; -/** */ +/** Used to extract summaries from web pages*/ require_once BASE_DIR."/lib/phrase_parser.php"; /** for crawlHash and crawlLog */ require_once BASE_DIR."/lib/utility.php"; /** for crawlDaemon function */ require_once BASE_DIR."/lib/crawl_daemon.php"; -/** */ +/** Used to fetches web pages and info from queue server*/ require_once BASE_DIR."/lib/fetch_url.php"; /** Loads common constants for web crawling*/ require_once BASE_DIR."/lib/crawl_constants.php"; @@ -107,65 +107,93 @@ mb_regex_encoding("UTF-8"); class Fetcher implements CrawlConstants { /** + * Reference to a database object. Used since has directory manipulation + * functions * @var object */ var $db; /** - * @var object + * Url or IP address of the queue_server to get sites to crawl from + * @var string */ var $queue_server; /** + * Contains each of the file extenstions this fetcher will try to process * @var array */ var $indexed_file_types; /** + * An associative array of (mimetype => name of processor class to handle) + * pairs. * @var array */ var $page_processors; /** + * WebArchiveBundle used to store complete web pages and auxiliary data * @var object */ var $web_archive; /** + * Timestamp of the current crawl * @var int */ var $crawl_time; /** + * Contains the list of web pages to crawl from the queue_server * @var array */ var $to_crawl; /** + * Summary information for visited sites that the fetcher hasn't sent to + * the queue_server yet * @var array */ var $found_sites; /** + * Timestamp from the queue_server of the current schedule of sites to + * download. 
This is sent back to the server once this schedule is completed + * to help the queue server implement crawl-delay if needed. * @var int */ var $schedule_time; /** + * The sum of the number of words of all the page description for the current + * crawl. This is used in computing document statistics. * @var int */ var $sum_seen_site_description_length; /** + * The sum of the number of words of all the page titles for the current + * crawl. This is used in computing document statistics. * @var int */ var $sum_seen_title_length; /** + * The sum of the number of words in all the page links for the current + * crawl. This is used in computing document statistics. * @var int */ var $sum_seen_site_link_length; /** + * Number of sites crawled in the current crawl * @var int */ var $num_seen_sites; /** + * Stores the name of the ordering used to crawl pages. This is used in a + * switch/case when computing weights of urls to be crawled before sending + * these new urls back to the queue_server. * @var string */ var $crawl_order; /** + * Sets up the field variables for that crawling can begin * + * @param array $indexed_file_types file extensions to index + * @param array $page_processors (mimetype => name of processor) pairs + * @param string $queue_server URL or IP address of the queue server */ function __construct($indexed_file_types, $page_processors, $queue_server) { @@ -189,7 +217,7 @@ class Fetcher implements CrawlConstants $this->num_seen_sites = 0; //we will get the correct crawl order from the queue_server - $this->crawl_order = "OPIC"; + $this->crawl_order = self::PAGE_IMPORTANCE; } @@ -209,7 +237,13 @@ class Fetcher implements CrawlConstants } /** + * Main loop for the fetcher. * + * Checks for stop message, checks queue server if crawl has changed and + * for new pages to crawl. Loop gets a group of next pages to crawl if + * there are pages left to crawl (otherwise sleep 5 seconds). 
It downloads + * these pages, deplicates them, and updates the found site info with the + * result before looping again. */ function loop() { @@ -292,6 +326,11 @@ class Fetcher implements CrawlConstants } /** + * Deletes any crawl web archive bundles not in the provided array of crawls + * + * @param array $still_active_crawls those crawls which should be deleted, + * so all others will be deleted + * @see loop() */ function deleteOldCrawls(&$still_active_crawls) { @@ -341,13 +380,17 @@ class Fetcher implements CrawlConstants } /** + * Get status, current crawl, crawl order, and new site information from + * the queue_server. + * + * @return array containing this info */ function checkScheduler() { $info = array(); - + $info[self::STATUS] = self::CONTINUE_STATE; if(count($this->to_crawl) > 0) { - $info[self::STATUS] = self::CONTINUE_STATE; + return; } @@ -374,18 +417,24 @@ class Fetcher implements CrawlConstants if(isset($info[self::SCHEDULE_TIME])) { $this->schedule_time = $info[self::SCHEDULE_TIME]; } + crawlLog(" Time to check Scheduler ".(changeInMicrotime($start_time))); - return $info; + return $info; } /** + * Prepare an array of up to NUM_MULTI_CURL_PAGES' worth of sites to be + * downloaded in one go using the to_crawl array. Delete these sites + * from the to_crawl array. + * + * @return array sites which are ready to be downloaded */ function getFetchSites() { $web_archive = $this->web_archive; - + $start_time = microtime(); $seeds = array(); @@ -439,6 +488,11 @@ class Fetcher implements CrawlConstants } /** + * Does page deduplication on an array of downloaded pages using a + * BloomFilterBundle of $this->web_archive. Deduplication based + * on summaries is also done on the queue server. + * + * @param array &$site_pages pages to deduplicate */ function deleteSeenPages(&$site_pages) { @@ -466,6 +520,14 @@ class Fetcher implements CrawlConstants } /** + * Processes an array of downloaded web pages with the appropriate page + * processor. 
+ * + * Summary data is extracted from each non robots.txt file in the array. + * Disallowed paths and crawl-delays are extracted from robots.txt files. + * + * @param array $site_pages a collection of web pages to process + * @return array summary data extracted from these pages */ function processFetchPages($site_pages) { @@ -600,7 +662,13 @@ class Fetcher implements CrawlConstants } /** + * Parses the contents of a robots.txt page extracting disallowed paths and + * Crawl-delay * + * @param array $robot_site array containing info about one robots.txt page + * @return array the $robot_site array with two new fields: one containing + * an array of disallowed paths, the other containing the crawl-delay + * if any */ function processRobotPage($robot_site) { @@ -658,7 +726,15 @@ class Fetcher implements CrawlConstants } /** - */ + * Updates the $this->found_sites array with data from the most recently + * downloaded sites. This means updating the following sub arrays: + * the self::ROBOT_PATHS, self::TO_CRAWL. It checks if there are still + * more urls to crawl or if self::SEEN_URLS has grown larger than + * SEEN_URLS_BEFORE_UPDATE_SCHEDULER. If so, a mini index is built and, + * the queue server is called with the data. + * + * @param array $sites site data to use for the update + */ function updateFoundSites($sites) { $start_time = microtime(); @@ -676,7 +752,7 @@ class Fetcher implements CrawlConstants self::CRAWL_DELAY] = $site[self::CRAWL_DELAY]; } } else { - $this->found_sites[self::SEEN_URLS][] = $site; + $this->found_sites[self::ROBOT_PATHS][] = $site; if(isset($site[self::LINKS])) { if(!isset($this->found_sites[self::TO_CRAWL])) { $this->found_sites[self::TO_CRAWL] = array(); @@ -730,7 +806,16 @@ class Fetcher implements CrawlConstants } /** + * Updates the queue_server about sites that have been crawled. * + * This method is called if there are currently no more sites to crawl or + * if SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages have been processed. 
It + * creates a inverted index of the non robot pages crawled and then compresses + * and does a post request to send the page summary data, robot data, + * to crawl url data, and inverted index back to the server. In the event + * that the server doesn't acknowledge it loops and tries again after a + * delay until the post is successful. At this point, memory for this data + * is freed. */ function updateScheduler() { @@ -788,6 +873,15 @@ class Fetcher implements CrawlConstants } /** + * Builds an inverted index (word --> {docs it appears in}) for the current + * batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages. This inverted + * is then merged by the queue_server into the inverted index of the + * current generation of the crawl. The complete inverted index for the + * whole crawl is built out of these inverted indexes for generations. + * The point of computing a partial inverted index on the fetcher is to + * reduce some of the computational burden on the queue server. The + * resulting mini index computed by buildMiniInvertedIndex() is stored in + * $this->found_sites[self::INVERTED_INDEX] * */ function buildMiniInvertedIndex() @@ -926,7 +1020,11 @@ class Fetcher implements CrawlConstants /** + * Used to compute number of words in each component (title, description, + * links) of a document separately as well as compute average amongst the + * current group of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many docs. 
* + * @return array computed statistics */ function computeDocumentStatistics() { @@ -1003,7 +1101,7 @@ class Fetcher implements CrawlConstants /** * Computes a sum of the values of an associative array of key-value pairs * - * @param array the associative array to compute the sum of + * @param array &$arr the associative array to compute the sum of */ function sumCountArray(&$arr) { diff --git a/bin/queue_server.php b/bin/queue_server.php index b0e3e5a10..61a53c88d 100755 --- a/bin/queue_server.php +++ b/bin/queue_server.php @@ -52,10 +52,10 @@ require_once BASE_DIR."/models/datasources/".DBMS."_manager.php"; /** Load the class that maintains our URL queue */ require_once BASE_DIR."/lib/web_queue_bundle.php"; -/** */ +/** Load word->{array of docs with word} index class */ require_once BASE_DIR."/lib/index_archive_bundle.php"; -/** */ +/** Used for manipulating urls*/ require_once BASE_DIR."/lib/url_parser.php"; /** For crawlHash function */ diff --git a/controllers/settings_controller.php b/controllers/settings_controller.php index 189d46934..b73802702 100755 --- a/controllers/settings_controller.php +++ b/controllers/settings_controller.php @@ -114,7 +114,8 @@ class SettingsController extends Controller $crawls = $this->crawlModel->getCrawlList(); $data['CRAWLS'] = array(); foreach($crawls as $crawl) { - $data['CRAWLS'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION']; + $data['CRAWLS'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION']. + " ... ".$crawl['COUNT']." urls"; } $crawl_stamps = array_keys($data['CRAWLS']); if($token_okay && isset($_REQUEST['index_ts']) && diff --git a/lib/bloom_filter_bundle.php b/lib/bloom_filter_bundle.php index 6710bd8d9..32ff0be7b 100644 --- a/lib/bloom_filter_bundle.php +++ b/lib/bloom_filter_bundle.php @@ -35,14 +35,15 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** * - * Code used to manage a bloom filter in-memory and in file - * a Bloom filter is used to store a set of objects. 
- * It can support inserts into the set and it can also be - * used to check membership in the set. + * A BloomFilterBundle is a directory of BloomFilterFile. + * The filter bundle, like a Bloom filter, also acts as a set, + * but once the active filter in it fills up a new filter is + * added to the bundle so that more data can be stored. * * @author Chris Pollett * @package seek_quarry * @subpackage library + * @see BloomFilterFile */ class BloomFilterBundle { @@ -78,9 +79,14 @@ class BloomFilterBundle const default_filter_size = 10000000; /** + * Creates or loads if already exists the directory structure and + * BloomFilterFiles used by this bundle * + * @param $dir_name directory when this bundles data is stored + * @param $filter_size the size of an individual filter in this bundle + * once a filter is filled a new one is added to the directory */ - public function __construct($dir_name, + function __construct($dir_name, $filter_size = self::default_filter_size ) { $this->dir_name = $dir_name; @@ -107,9 +113,14 @@ class BloomFilterBundle } /** + * Inserts a $value into the BloomFilterBundle * + * This involves inserting into the current filter, if the filter + * is full, a new filter is added before the value is added + * + * @param string $value a item to add to the filter bundle */ - public function add($value) + function add($value) { if($this->current_filter_count >= $this->filter_size) { $this->current_filter->save(); @@ -130,9 +141,14 @@ class BloomFilterBundle } /** + * Removes from the passed array those elements $elt who either are not in + * the filter bundle or whose $elt[$field_name] is not in the bundle. 
* + * @param array &$arr the array to remove elements from + * @param string $field_name if not NULL the field name of $arr to use to + * do filtering */ - public function differenceFilter(&$arr, $field_name = NULL) + function differenceFilter(&$arr, $field_name = NULL) { $num_filters = $this->num_filters; @@ -160,9 +176,10 @@ class BloomFilterBundle } /** - * + * Loads from the filter bundles' meta.txt the meta data associated with + * this filter bundle and stores this data into field variables */ - public function loadMetaData() + function loadMetaData() { if(file_exists($this->dir_name.'/meta.txt')) { $meta = unserialize( @@ -181,7 +198,7 @@ class BloomFilterBundle * Saves the meta data (number of filter, number of items stored, and size) * of the bundle */ - public function saveMetaData() + function saveMetaData() { $meta = array(); $meta['NUM_FILTERS'] = $this->num_filters; @@ -194,7 +211,7 @@ class BloomFilterBundle /** * Used to save to disk all the file data associated with this bundle */ - public function forceSave() + function forceSave() { $this->saveMetaData(); $this->current_filter->save(); diff --git a/lib/bloom_filter_file.php b/lib/bloom_filter_file.php index d816928c9..77c494d87 100755 --- a/lib/bloom_filter_file.php +++ b/lib/bloom_filter_file.php @@ -34,14 +34,13 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * + * Load base class with methods for loading and saving this structure */ require_once "persistent_structure.php"; /** - * - * Code used to manage a bloom filter in-memory and in file - * a Bloom filter is used to store a set of objects. + * Code used to manage a bloom filter in-memory and in file. + * A Bloom filter is used to store a set of objects. * It can support inserts into the set and it can also be * used to check membership in the set. 
* @@ -52,14 +51,35 @@ require_once "persistent_structure.php"; class BloomFilterFile extends PersistentStructure { + /** + * Number of bit positions in the Bloom filter used to say an item is + * in the filter + * @var int + */ var $num_keys; + /** + * Size in bits of the packed string array used to store the filter's + * contents + * @var int + */ var $filter_size; + /** + * Packed string used to store the Bloom filters + * @var string + */ var $filter; /** + * Initializes the fields of the BloomFilter and its base + * PersistentStructure. * + * @param string $fname name of the file to store the BloomFilter data in + * @param int $num_values the maximum number of values that will be stored + * in the BloomFilter. Filter will be sized so the odds of a false + * positive are roughly one over this value + * @param int $save_frequency how often to store the BloomFilter to disk */ - public function __construct($fname, $num_values, + function __construct($fname, $num_values, $save_frequency = self::DEFAULT_SAVE_FREQUENCY) { $log2 = log(2); @@ -75,9 +95,11 @@ class BloomFilterFile extends PersistentStructure } /** + * Inserts the provided item into the Bloomfilter * + * @param string $value item to add to filter */ - public function add($value) + function add($value) { $num_keys = $this->num_keys; for($i = 0; $i < $num_keys; $i++) { @@ -89,9 +111,12 @@ class BloomFilterFile extends PersistentStructure } /** + * Checks if the BloomFilter contains the provided $value * + * @param string $value item to check if is in the BloomFilter + * @return bool whether $value was in the filter or not */ - public function contains($value) + function contains($value) { $num_keys = $this->num_keys; for($i = 0; $i < $num_keys; $i++) { @@ -106,7 +131,10 @@ class BloomFilterFile extends PersistentStructure } /** + * Hashes $value to a bit position in the BloomFilter * + * @param string $value value to map to a bit position in the filter + * @return int the bit position mapped to */ function 
getHashBitPosition($value) { @@ -120,7 +148,9 @@ class BloomFilterFile extends PersistentStructure } /** + * Sets to true the ith bit position in the filter. * + * @param int $i the position to set to true */ function setBit($i) { @@ -129,13 +159,16 @@ class BloomFilterFile extends PersistentStructure $bit_in_byte = $i - ($byte << 3); $tmp = $this->filter[$byte]; - + $this->filter[$byte] = $tmp | chr(1 << $bit_in_byte); } /** + * Looks up the value of the ith bit position in the filter * + * @param int $i the position to look up + * @return bool the value of the looked up position */ function getBit($i) { diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php index 022265052..da45cb25a 100644 --- a/lib/crawl_constants.php +++ b/lib/crawl_constants.php @@ -34,7 +34,9 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * + * Shared constants and enums used by components that are involved in the + * crawling process + * * @author Chris Pollett * @package seek_quarry * @subpackage library diff --git a/lib/crawl_daemon.php b/lib/crawl_daemon.php index 6c46a3c3c..cb3039077 100644 --- a/lib/crawl_daemon.php +++ b/lib/crawl_daemon.php @@ -34,14 +34,20 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * + * Load system-wide defines */ - require_once BASE_DIR."/configs/config.php"; -require_once BASE_DIR."/lib/utility.php"; //for crawlLog +/** + * Load the crawlLog function + */ +require_once BASE_DIR."/lib/utility.php"; +/** + * Load common constants for crawling + */ require_once BASE_DIR."/lib/crawl_constants.php"; /** + * Used to run scripts as a daemon on *nix systems * * @author Chris Pollett * @package seek_quarry @@ -49,10 +55,19 @@ require_once BASE_DIR."/lib/crawl_constants.php"; */ class CrawlDaemon implements CrawlConstants { + + /** + * Name prefix to be used on files associated with this daemon + * (such as lock like and messages) + * @var string + * @static + */ static $name; /** + * Callback function to handle signals sent to 
this daemon * + * @param int $signo signal sent to the daemon */ static function processHandler($signo) { @@ -65,7 +80,7 @@ class CrawlDaemon implements CrawlConstants file_put_contents( CRAWL_DIR."/schedules/".self::$name."_messages.txt", serialize($info)); - unlink(CRAWL_DIR."/schedules/".self::$name."_lock.txt"); + unlink(CRAWL_DIR."/schedules/".self::$name."_lock.txt"); break; case SIGSEGV: @@ -79,7 +94,17 @@ class CrawlDaemon implements CrawlConstants } /** + * Used to send a message the given daemon or run the program in the + * foreground. * + * @param array $argv an array of command line arguments. The argument + * start will check if the process control functions exists if these + * do they will fork and detach a child process to act as a daemon. + * a lock file will be created to prevent additional daemons from + * running. If the message is stop then a message file is written to + * tell the daemon to stop. If the argument is terminal then the + * program won't be run as a daemon. 
+ * @param string $name the prefix to use for lock and message files */ static function init($argv, $name) { @@ -103,8 +128,6 @@ class CrawlDaemon implements CrawlConstants //the next code is for running as a daemon on *nix systems $terminal_flag = strcmp($argv[1], "terminal") == 0; if(function_exists("pcntl_fork") && !$terminal_flag) { - - $pid = pcntl_fork(); if ($pid == -1) { die("could not fork"); diff --git a/lib/hash_table.php b/lib/hash_table.php index 7a547d06b..49474e0dd 100755 --- a/lib/hash_table.php +++ b/lib/hash_table.php @@ -52,21 +52,54 @@ require_once "utility.php"; class HashTable extends StringArray { + /** + * + * @var int + */ var $key_size; + /** + * + * @var int + */ var $value_size; + /** + * + * @var string + */ var $null; + /** + * + * @var string + */ var $deleted; - + /** + * + * @var int + */ var $count; + /** + * + * + */ const ALWAYS_RETURN_PROBE = 1; + /** + * + * + */ const RETURN_PROBE_ON_KEY_FOUND = 0; + /** + * + * + */ const RETURN_VALUE = -1; /** - */ - public function __construct($fname, $num_values, $key_size, $value_size, + * + * + */ + function __construct($fname, $num_values, $key_size, $value_size, $save_frequency = self::DEFAULT_SAVE_FREQUENCY) { $this->key_size = $key_size; @@ -80,7 +113,11 @@ class HashTable extends StringArray $key_size + $value_size, $save_frequency); } - public function insert($key, $value) + /** + * + * + */ + function insert($key, $value) { $null = $this->null; $deleted = $this->deleted; @@ -124,13 +161,20 @@ class HashTable extends StringArray } + /** + * + * + */ function lookup($key, $return_probe_value = self::RETURN_VALUE) { return $this->lookupArray( $key, array($this->null), $return_probe_value); } - + /** + * + * + */ function lookupArray($key, $null_array, $return_probe_value = self::RETURN_VALUE) { @@ -167,7 +211,11 @@ class HashTable extends StringArray } - public function delete($key) + /** + * + * + */ + function delete($key) { $deleted = pack("H2x".($this->key_size + 
$this->value_size - 1), "FF"); //deletes @@ -185,6 +233,10 @@ class HashTable extends StringArray } + /** + * + * + */ function getEntry($i) { $raw = $this->get($i); @@ -194,6 +246,10 @@ class HashTable extends StringArray return array($key, $value); } + /** + * + * + */ function hash($key) { $hash = substr(md5($key, true), 0, 4); diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php index 5b349929a..86f3083f1 100644 --- a/lib/index_archive_bundle.php +++ b/lib/index_archive_bundle.php @@ -34,19 +34,42 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * Load in all dependencies for IndexArchiveBundle, if necessary - */ - + * Summaries and word document list stored in WebArchiveBundle's so load it + */ require_once 'web_archive_bundle.php'; +/** + * Filters used to check if words appear in a given generation + */ require_once 'bloom_filter_file.php'; +/** + * + */ require_once 'bloom_filter_bundle.php'; +/** + * + */ require_once 'gzip_compressor.php'; +/** + * + */ require_once 'non_compressor.php'; +/** + * + */ require_once 'utility.php'; -/** Loads common constants for web crawling*/ +/** Loads common constants for web crawling + * + */ require_once 'crawl_constants.php'; /** + * Enumerative interface for common constants between WordIterator and + * IndexArchiveBundle + * + * These constants are used as fields in arrays. They are negative to + * distinguish them from normal array elements 0, 1, 2... 
However, this + * means you need to be slightly careful if you try to sort the array + * as this might screw things up * * @author Chris Pollett * @package seek_quarry @@ -64,7 +87,12 @@ interface IndexingConstants /** + * Callback function used to set the offsets into the archive file from + * the paritcular word info in the header block of a WordArchive * + * @param array $data + * @param array $objects + * @param string $offset_field */ function setOffsetPointers($data, &$objects, $offset_field) { @@ -100,21 +128,22 @@ function setOffsetPointers($data, &$objects, $offset_field) * * @author Chris Pollett * @package seek_quarry + * @subpackage library */ class WordIterator implements IndexingConstants, CrawlConstants { var $word_key; var $index; var $seen_docs; + var $restricted_seen_docs; + var $count_block_unfiltered; var $num_docs; - var $diagnostics; - //common word fields + var $next_offset; var $last_pointed_block; var $list_offset; - //rare word fields var $block_pointers; var $num_full_blocks; @@ -127,25 +156,31 @@ class WordIterator implements IndexingConstants, CrawlConstants /** * */ - public function __construct($word_key, $index, $limit = 0) + public function __construct($word_key, $index, $limit = 0, $info_block = NULL) { $this->word_key = $word_key; $this->index = $index; $this->limit = $limit; - $this->reset(); + $this->reset($info_block); } /** * */ - public function reset() + public function reset($info_block = NULL) { + $this->restricted_seen_docs = 0; + $this->count_block_unfiltered = 0; + $partition = WebArchiveBundle::selectPartition($this->word_key, $this->index->num_partitions_index); - $this->info_block = $this->index->getPhraseIndexInfo($this->word_key); - + if($info_block == NULL) { + $this->info_block = $this->index->getPhraseIndexInfo($this->word_key); + } else { + $this->info_block = $info_block; + } if($this->info_block !== NULL) { $this->num_generations = count($this->info_block['GENERATIONS']); $count_till_generation = 
$this->info_block[self::COUNT]; @@ -171,8 +206,10 @@ class WordIterator implements IndexingConstants, CrawlConstants } + $this->seen_docs = $count_till_generation - + $this->info_block[self::COUNT]; $this->initGeneration(); - $this->seen_docs = $this->current_pointer * BLOCK_SIZE; + } @@ -193,8 +230,9 @@ class WordIterator implements IndexingConstants, CrawlConstants $this->num_docs = $info_block['TOTAL_COUNT']; $this->num_docs_generation = $info_block[self::COUNT]; - $this->current_pointer = floor($this->limit / BLOCK_SIZE); - + $this->current_pointer = + max(floor(($this->limit - $this->seen_docs) / BLOCK_SIZE), 0); + $this->seen_docs += $this->current_pointer*BLOCK_SIZE; $this->last_block = $info_block[self::END_BLOCK]; $this->num_full_blocks = floor($this->num_docs_generation / BLOCK_SIZE); @@ -215,8 +253,7 @@ class WordIterator implements IndexingConstants, CrawlConstants if($info_block[self::LIST_OFFSET] === NULL) { $this->list_offset = NULL; } else { - $this->list_offset = $info_block[self::LIST_OFFSET][0]; - $this->current_block_num =$info_block[self::LIST_OFFSET][1]; + $this->list_offset = $info_block[self::LIST_OFFSET]; } } @@ -240,14 +277,53 @@ class WordIterator implements IndexingConstants, CrawlConstants if($this->current_pointer == $this->num_full_blocks) { $pages = $this->last_block; } else if ($this->current_pointer >= $this->last_pointed_block) { + /* if there are more than COMMON_WORD_THRESHOLD many + results and we're not at the last block yet + */ if($this->list_offset === NULL) { return -1; } - $doc_block = $this->index->getWordDocBlock($this->word_key, - $this->list_offset, $generation); - - $pages = $doc_block[$this->word_key.":".$this->current_pointer]; + $offset = $this->list_offset; + $found = false; + do { + /* the link list is actually backwards to the order we want + For now, we cycle along the list from the last data + stored until we find the block we want. 
This is slow + but we are relying on the fact that each generation is + not too big. + */ + $doc_block = $this->index->getWordDocBlock($this->word_key, + $offset, $generation); + $word_keys = array_keys($doc_block); + $found_key = NULL; + foreach($word_keys as $word_key) { + if(strstr($word_key, $this->word_key.":")) { + $found_key = $word_key; + if(isset($doc_block[ + $found_key][self::LIST_OFFSET])) { + //only one list offset/docblock + break; + } + } + } + if($found_key === NULL) { + break; + } + if(isset($doc_block[ + $this->word_key.":".$this->current_pointer])) { + $found = true; + break; + } + $offset = $doc_block[$found_key][self::LIST_OFFSET]; + } while($offset != NULL); + if($found != true) { + $pages = array(); + } else { + $pages = $doc_block[ + $this->word_key.":".$this->current_pointer]; + } } else { + //first COMMON_WORD_THRESHOLD many results fast if(isset($this->block_pointers[$this->current_pointer])) { $doc_block = $this->index->getWordDocBlock($this->word_key, $this->block_pointers[$this->current_pointer], @@ -265,15 +341,18 @@ class WordIterator implements IndexingConstants, CrawlConstants $pages = array(); } } - + if($this->seen_docs < $this->limit) { $diff_offset = $this->limit - $this->seen_docs; + $pages = array_slice($pages, $diff_offset); } + $this->count_block_unfiltered = count($pages); if($restrict_phrases != NULL) { + $out_pages = array(); - if(count($pages) >0 ) { + if(count($pages) > 0 ) { foreach($pages as $doc_key => $doc_info) { if(isset($doc_info[self::SUMMARY_OFFSET])) { @@ -314,28 +393,27 @@ class WordIterator implements IndexingConstants, CrawlConstants public function nextDocsWithWord($restrict_phrases = NULL) { $doc_block = $this->currentDocsWithWord($restrict_phrases); - - $this->seen_docs += count($doc_block); - + if($this->seen_docs < $this->limit) { + $this->seen_docs = $this->count_block_unfiltered + $this->limit; + } else { + $this->seen_docs += $this->count_block_unfiltered; + } + $this->restricted_seen_docs += 
count($doc_block); if($doc_block == -1 || !is_array($doc_block)) { return NULL; } - if(isset($doc_block[self::LIST_OFFSET]) && - $doc_block[self::LIST_OFFSET] != NULL) { - $this->list_offset = $doc_block[self::LIST_OFFSET]; - } - + $this->current_pointer ++; if($this->current_pointer > $this->num_full_blocks) { $flag = false; while ($this->info_block['CURRENT_GENERATION_INDEX'] < - $this->num_generations -1 && !$flag) { + $this->num_generations - 1 && !$flag) { $this->info_block['CURRENT_GENERATION_INDEX']++; $flag = $this->initGeneration(); } if ($this->info_block['CURRENT_GENERATION_INDEX'] >= - $this->num_generations -1) { - $this->current_pointer = -1; + $this->num_generations - 1) { + $this->current_pointer = - 1; } } @@ -349,10 +427,11 @@ class WordIterator implements IndexingConstants, CrawlConstants * * @author Chris Pollett * @package seek_quarry + * @subpackage library */ class IndexArchiveBundle implements IndexingConstants, CrawlConstants { - + var $diagnostics; var $dir_name; var $description; var $num_partitions_summaries; @@ -627,22 +706,23 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants * */ public function getSummariesByHash($word_key, $limit, $num, - $restrict_phrases = NULL, $phrase_key = NULL) + $restrict_phrases = NULL, $phrase_key = NULL, $phrase_info = NULL) { if($phrase_key == NULL) { $phrase_key = $word_key; } - - $phrase_info = $this->getPhraseIndexInfo($phrase_key); + + if($phrase_info == NULL) { + $phrase_info = $this->getPhraseIndexInfo($phrase_key); + } if($phrase_info == NULL || (isset($phrase_info[self::PARTIAL_COUNT]) && $phrase_info[self::PARTIAL_COUNT] < $limit + $num)) { - $this->addPhraseIndex( $word_key, $restrict_phrases, $phrase_key, $limit + $num); } - $iterator = new WordIterator($phrase_key, $this, $limit); + $iterator = new WordIterator($phrase_key, $this, $limit, $phrase_info); $num_retrieved = 0; $pages = array(); @@ -650,6 +730,7 @@ class IndexArchiveBundle implements IndexingConstants, 
CrawlConstants while(is_array($next_docs = $iterator->nextDocsWithWord()) && $num_retrieved < $num) { $num_docs_in_block = count($next_docs); + foreach($next_docs as $doc_key => $doc_info) { if(isset($doc_info[self::SUMMARY_OFFSET])) { $page = $this->getPage( @@ -764,7 +845,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } } } - $num_generations = count($info['GENERATIONS']); if($num_generations == 0) { return NULL; @@ -776,7 +856,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants $block_info = $this->readPartitionInfoBlock( $partition, $info['GENERATIONS'][$i]); - $sum_count += $block_info[$phrase_key][self::COUNT]; } @@ -799,7 +878,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants $info['CURRENT_GENERATION_INDEX']; $phrase_info['TOTAL_COUNT'] = $info['TOTAL_COUNT']; $phrase_info['GENERATIONS'] = $info['GENERATIONS']; - return $phrase_info; } else { return NULL; @@ -857,7 +935,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants $word_data[$phrase_key] = array_slice($buffer, 0, COMMON_WORD_THRESHOLD); - $this->addPartitionWordData($partition,$word_data, $first_time); + $this->addPartitionWordData($partition, $word_data, $first_time); $first_time = false; $buffer = array_slice($buffer, COMMON_WORD_THRESHOLD); $current_count += COMMON_WORD_THRESHOLD; @@ -954,3 +1032,4 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } ?> + diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php index 37343e4ae..c22b1b664 100755 --- a/lib/phrase_parser.php +++ b/lib/phrase_parser.php @@ -34,7 +34,7 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * load the stem word function, if necessary + * Load the stem word function, if necessary */ require_once BASE_DIR."/lib/porter_stemmer.php"; @@ -44,7 +44,7 @@ require_once BASE_DIR."/lib/porter_stemmer.php"; require_once BASE_DIR."/lib/crawl_constants.php"; /** - * library of functions used to 
manipulate words and phrases + * Library of functions used to manipulate words and phrases * * * @author Chris Pollett @@ -55,7 +55,11 @@ require_once BASE_DIR."/lib/crawl_constants.php"; class PhraseParser { /** + * Converts a summary of a web page into a string of space separated words * + * @param array $page associative array of page summary data. Contains + * title, description, and links fields + * @return string the concatenated words extracted from the page summary */ static function extractWordStringPageSummary($page) { @@ -80,7 +84,12 @@ class PhraseParser } /** + * Extracts all phrases (sequences of adjacent words) from $string of + * length less than or equal to $len. * + * @param string $string subject to extract phrases from + * @param int $len longest length of phrases to consider + * @return array pairs of the form (phrase, number of occurrences) */ static function extractPhrasesAndCount($string, $len = MAX_PHRASE_LEN) @@ -99,7 +108,12 @@ class PhraseParser } /** + * Extracts all phrases (sequences of adjacent words) from $string of + * length exactly equal to $len. * + * @param string $string subject to extract phrases from + * @param int $len length of phrases to consider + * @return array pairs of the form (phrase, number of occurrences) */ static function extractPhrasesOfLength($string, $phrase_len) { @@ -114,7 +128,15 @@ class PhraseParser } /** + * Extracts phrases (sequences of adjacent words) from $string of + * length exactly equal to $len, beginning with the $offset'th word. + * This extracts the $len + * many words after offset, then the $len + * many words after that, and so on. 
* + * @param string $string subject to extract phrases from + * @param int $len length of phrases to consider + * @param int $offset the first word to begin with + * @return array pairs of the form (phrase, number of occurrences) */ static function extractPhrasesOfLengthOffset($string, $phrase_len, $offset) diff --git a/lib/porter_stemmer.php b/lib/porter_stemmer.php index 74f6d3f99..890260322 100755 --- a/lib/porter_stemmer.php +++ b/lib/porter_stemmer.php @@ -50,9 +50,33 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} class PorterStemmer { - static $buffer, $k, $j; + /** + * storage used in computing the stem + * @var string + */ + static $buffer; + /** + * Index of the current end of the word at the current state of computing + * its stem + * @var int + */ + static $k; + /** + * Index to start of the suffix of the word being considered for + * manipulation + * @var int + */ + static $j; - public static function stem($word) + /** + * Computes the stem of an English word + * + * For example, jumps, jumping, jumpy, all have jump as a stem + * + * @param string $word the string to stem + * @return string the stem of $word + */ + static function stem($word) { self::$buffer = $word; @@ -74,6 +98,7 @@ class PorterStemmer * Checks to see if the ith character in the buffer is a consonant * * @param int $i the character to check + * @return bool if the ith character is a consonant */ private static function cons($i) { @@ -88,15 +113,17 @@ class PorterStemmer } } - /** m() measures the number of consonant sequences between k0 and j. if c is - * a consonant sequence and v a vowel sequence, and <.> indicates arbitrary - * presence, - - * <c><v> gives 0 - * <c>vc<v> gives 1 - * <c>vcvc<v> gives 2 - * <c>vcvcvc<v> gives 3 + /** + * m() measures the number of consonant sequences between 0 and j. if c is + * a consonant sequence and v a vowel sequence, and [.] 
indicates arbitrary + * presence, + * <pre> + * [c][v] gives 0 + * [c]vc[v] gives 1 + * [c]vcvc[v] gives 2 + * [c]vcvcvc[v] gives 3 * .... + * </pre> */ private static function m() { @@ -130,7 +157,11 @@ class PorterStemmer } } - /* vowelinstem() is TRUE <=> k0,...j contains a vowel */ + /** + * Checks if 0,...$j contains a vowel + * + * @return bool whether it does or not + */ private static function vowelinstem() { @@ -140,7 +171,11 @@ class PorterStemmer return false; } - /* doublec(j) is TRUE <=> j,(j-1) contain a double consonant. */ + /** + * Checks if $j,($j-1) contain a double consonant. + * + * @return bool if it does or not + */ private static function doublec($j) { @@ -149,14 +184,17 @@ class PorterStemmer return self::cons($j); } - /* cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant - and also if the second c is not w,x or y. this is used when trying to - restore an e at the end of a short word. e.g. - - cav(e), lov(e), hop(e), crim(e), but - snow, box, tray. - - */ + /** + * Checks whether the letters at the indices $i-2, $i-1, $i in the buffer + * have the form consonant - vowel - consonant and also if the second c is + * not w,x or y. this is used when trying to restore an e at the end of a + * short word. e.g. + *<pre> + * cav(e), lov(e), hop(e), crim(e), but + * snow, box, tray. + *</pre> + * @return bool whether the letters at indices have the given form + */ private static function cvc($i) { @@ -169,7 +207,12 @@ class PorterStemmer return true; } - /* ends(s) is TRUE <=> k0,...k ends with the string s. */ + /** + * Checks if the buffer currently ends with the string $s + * + * @param string $s string to use for check + * @return bool whether buffer currently ends with $s + */ private static function ends($s) { @@ -184,8 +227,12 @@ class PorterStemmer return true; } - /* setto(s) sets (j+1),...k to the characters in the string s, readjusting - k. 
*/ + /** + * setto($s) sets (j+1),...k to the characters in the string $s, readjusting + * k. + * + * @param string $s string to modify the end of buffer with + */ private static function setto($s) { @@ -195,34 +242,38 @@ class PorterStemmer self::$k = self::$j + $len; } - /* r(s) is used further down. */ - + /** + * Sets the ending in the buffer to $s if the number of consonant sequences + * between $k and $j is positive. + * + * @param string $s what to change the suffix to + */ private static function r($s) { if (self::m() > 0) self::setto($s); } - /* step1ab() gets rid of plurals and -ed or -ing. e.g. - - caresses -> caress - ponies -> poni - ties -> ti - caress -> caress - cats -> cat - - feed -> feed - agreed -> agree - disabled -> disable - - matting -> mat - mating -> mate - meeting -> meet - milling -> mill - messing -> mess - - meetings -> meet - - */ + /** step1ab() gets rid of plurals and -ed or -ing. e.g. + * <pre> + * caresses -> caress + * ponies -> poni + * ties -> ti + * caress -> caress + * cats -> cat + * + * feed -> feed + * agreed -> agree + * disabled -> disable + * + * matting -> mat + * mating -> mate + * meeting -> meet + * milling -> mill + * messing -> mess + * + * meetings -> meet + * </pre> + */ private static function step1ab() { @@ -256,7 +307,9 @@ class PorterStemmer } } - /* step1c() turns terminal y to i when there is another vowel in the stem. */ + /** + * step1c() turns terminal y to i when there is another vowel in the stem. + */ private static function step1c() { @@ -266,9 +319,11 @@ class PorterStemmer } - /* step2() maps double suffices to single ones. so -ization ( = -ize plus - -ation) maps to -ize etc.Note that the string before the suffix must give - m() > 0. */ + /** + * step2() maps double suffices to single ones. so -ization ( = -ize plus + * -ation) maps to -ize etc.Note that the string before the suffix must give + * m() > 0. 
+ */ private static function step2() { if(self::$k < 1) return; @@ -314,7 +369,9 @@ class PorterStemmer } } - /* step3() deals with -ic-, -full, -ness etc. similar strategy to step2. */ + /** + * step3() deals with -ic-, -full, -ness etc. similar strategy to step2. + */ private static function step3() { @@ -338,8 +395,9 @@ class PorterStemmer } } - /* step4() takes off -ant, -ence etc., in context <c>vcvc<v>. */ - + /** + * step4() takes off -ant, -ence etc., in context <c>vcvc<v>. + */ private static function step4() { if(self::$k < 1) return; diff --git a/lib/priority_queue.php b/lib/priority_queue.php index d6973d2ef..b50468948 100755 --- a/lib/priority_queue.php +++ b/lib/priority_queue.php @@ -34,11 +34,20 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * Load in base classes and interfaces,get the crawlHash function, if necessary + * Load in base class */ require_once "string_array.php"; +/** + * + */ require_once "notifier.php"; +/** + * + */ require_once "utility.php"; +/** + * + */ require_once "crawl_constants.php"; /** @@ -53,19 +62,43 @@ require_once "crawl_constants.php"; */ class PriorityQueue extends StringArray implements CrawlConstants { + /** + * + * + */ var $num_values; + /** + * + * + */ var $value_size; + /** + * + * + */ var $weight_size = 4; //size of a float + /** + * + * + */ var $count; + /** + * + * + */ var $min_or_max; + /** + * + * + */ var $notifier; // who to call if move an item in queue /** * */ - public function __construct($fname, $num_values, $value_size, + function __construct($fname, $num_values, $value_size, $min_or_max, $notifier = NULL, $save_frequency = self::DEFAULT_SAVE_FREQUENCY) { @@ -85,7 +118,7 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * */ - public function peek($i = 1) + function peek($i = 1) { if($i < 1 || $i > $this->count) { crawlLog("Peek Index $i not in Range [1, {$this->count}]"); @@ -97,7 +130,7 @@ class PriorityQueue extends StringArray implements 
CrawlConstants /** * */ - public function poll($i = 1) + function poll($i = 1) { if($i < 1 || $i > $this->count) { crawlLog("Index $i not in Range [1, {$this->count}]"); @@ -118,7 +151,7 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * */ - public function insert($data, $weight) + function insert($data, $weight) { if($this->count == $this->num_values) { return false; @@ -136,7 +169,7 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * */ - public function adjustWeight($i, $delta) + function adjustWeight($i, $delta) { if( ($tmp = $this->peek($i)) === false) { crawlLog("Index $i not in queue adjust weight failed"); @@ -166,7 +199,7 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * */ - public function printContents() + function printContents() { for($i = 1; $i <= $this->count; $i++) { $row = $this->peek($i); @@ -177,7 +210,7 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * */ - public function getContents() + function getContents() { $rows = array(); for($i = 1; $i <= $this->count; $i++) { @@ -190,7 +223,7 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * */ - public function normalize($new_total = NUM_URLS_QUEUE_RAM) + function normalize($new_total = NUM_URLS_QUEUE_RAM) { $count = $this->count; $total_weight = $this->totalWeight(); diff --git a/lib/processors/pdf_processor.php b/lib/processors/pdf_processor.php index 2c5e84112..d83df545e 100755 --- a/lib/processors/pdf_processor.php +++ b/lib/processors/pdf_processor.php @@ -34,7 +34,7 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * + * Load in the base class if necessary */ require_once BASE_DIR."/lib/processors/text_processor.php"; diff --git a/lib/string_array.php b/lib/string_array.php index 7d61ec1c5..3907eb805 100755 --- a/lib/string_array.php +++ b/lib/string_array.php @@ -52,34 +52,41 @@ class StringArray extends PersistentStructure { /** - * - */ - var $filename; - 
/** - * + * Number of items to be stored in the StringArray + * @var int */ var $num_values; /** - * + * Size of each item in bytes to be stored + * @var int */ - var $array_size; + var $data_size; /** - * + * Number of bytes of storage needed by the string array + * @var int */ - var $data_size; + var $array_size; /** - * + * Character string used to store the packed data of the StringArray + * @var string */ var $string_array; /** + * Initializes the fields of the StringArray and its parent class + * PersistentStructure. Creates a null filled string array of size + * $this->string_array_size to store data in. * + * @param string $fname the name of the file to store data persistently in + * @param int $num_values the number of items the StringArray will store + * @param int $data_size the size in bytes of a single item + * @param int $save_frequency how often the StringArray should be stored to + * disk */ public function __construct($fname, $num_values, $data_size, $save_frequency = self::DEFAULT_SAVE_FREQUENCY) { - $this->filename = $fname; $this->num_values = $num_values; $this->data_size = $data_size; @@ -93,7 +100,10 @@ class StringArray extends PersistentStructure /** + * Looks up the ith item in the StringArray * + * @param int $i array index of item to look up + * @return string the looked-up item of length $this->data_size */ public function get($i) { @@ -103,7 +113,11 @@ class StringArray extends PersistentStructure /** + * Puts data into the ith item of the StringArray * + * @param int $i array index of where to store data + * @param string $data at least $this->data_size many bytes of data to + * store */ public function put($i, $data) { diff --git a/lib/web_archive.php b/lib/web_archive.php index bf4dabf78..b41a00dc7 100755 --- a/lib/web_archive.php +++ b/lib/web_archive.php @@ -50,19 +50,39 @@ require_once "utility.php"; class WebArchive { - const OPEN_AND_CLOSE = 1; - const OPEN = 2; - const CLOSE = 3; - + /** + * + */ var $filename; + /** + * + */ 
var $iterator_pos; + /** + * + */ var $compressor; + /** + * + */ var $count; /** * */ - public function __construct($fname, $compressor, $fast_construct = false) + const OPEN_AND_CLOSE = 1; + /** + * + */ + const OPEN = 2; + /** + * + */ + const CLOSE = 3; + /** + * + */ + function __construct($fname, $compressor, $fast_construct = false) { $this->filename = $fname; $this->compressor = $compressor; @@ -83,7 +103,7 @@ class WebArchive /** * */ - public function readInfoBlock() + function readInfoBlock() { $fh = fopen($this->filename, "r"); $len = $this->seekEndObjects($fh); @@ -101,7 +121,7 @@ class WebArchive /** * */ - public function writeInfoBlock($fh = NULL, &$data = NULL) + function writeInfoBlock($fh = NULL, &$data = NULL) { $open_flag = false; if($fh == NULL) { @@ -129,7 +149,7 @@ class WebArchive /** * */ - public function seekEndObjects($fh) + function seekEndObjects($fh) { fseek($fh, - 4, SEEK_END); $len_block_arr = unpack("N", fread($fh, 4)); @@ -142,7 +162,7 @@ class WebArchive /** * */ - public function addObjects($offset_field, &$objects, + function addObjects($offset_field, &$objects, $data = NULL, $callback = NULL, $return_flag = true) { @@ -192,7 +212,7 @@ class WebArchive /** * */ - public function open($mode = "r") + function open($mode = "r") { $fh = fopen($this->filename, $mode); return $fh; @@ -201,7 +221,7 @@ class WebArchive /** * Closes a file handle (which should be of a web archive) */ - public function close($fh) + function close($fh) { fclose($fh); } @@ -209,7 +229,7 @@ class WebArchive /** * */ - public function getObjects($offset, $num, $next_flag = true, $fh = NULL) + function getObjects($offset, $num, $next_flag = true, $fh = NULL) { $open_flag = false; @@ -261,7 +281,7 @@ class WebArchive * @param int $num number of objects to return * @return array an array of objects from the web archive */ - public function currentObjects($num) + function currentObjects($num) { return $this->getObjects($this->iterator_pos, $num, false); } @@ 
-274,7 +294,7 @@ class WebArchive * @param int $num number of objects to return * @return array an array of objects from the web archive */ - public function nextObjects($num) + function nextObjects($num) { return $this->getObjects($this->iterator_pos, $num); } @@ -283,7 +303,7 @@ class WebArchive * Resets the iterator for this web archive to the first object * in the archive */ - public function reset() + function reset() { $this->iterator_pos = 0; } diff --git a/lib/web_archive_bundle.php b/lib/web_archive_bundle.php index 72c50d3d3..ff7fac6d4 100755 --- a/lib/web_archive_bundle.php +++ b/lib/web_archive_bundle.php @@ -34,7 +34,7 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * A WebArchiveBundle is a collection of WebArchive, so need definition of + * A WebArchiveBundle is a collection of WebArchive, so load definition of * web archive */ require_once 'web_archive.php'; @@ -104,7 +104,7 @@ class WebArchiveBundle /** * */ - public function __construct($dir_name, $filter_size = -1, + function __construct($dir_name, $filter_size = -1, $num_partitions = NULL, $description = NULL, $compressor = "GzipCompressor") { @@ -173,7 +173,7 @@ class WebArchiveBundle /** * */ - public function addPages($key_field, $offset_field, &$pages) + function addPages($key_field, $offset_field, &$pages) { $partition_queue = array(); for($i = 0; $i < $this->num_partitions; $i++) { @@ -215,7 +215,7 @@ class WebArchiveBundle /** * */ - public function getPage($key, $offset) + function getPage($key, $offset) { $partition = WebArchiveBundle::selectPartition($key, $this->num_partitions); @@ -226,7 +226,7 @@ class WebArchiveBundle /** * */ - public function getPageByPartition($partition, $offset, $file_handle = NULL) + function getPageByPartition($partition, $offset, $file_handle = NULL) { $page_array = $this->getPartition($partition)->getObjects( @@ -242,7 +242,7 @@ class WebArchiveBundle /** * */ - public function addPageFilter($key_field, &$page) + function 
addPageFilter($key_field, &$page) { if($this->filter_size > 0) { $this->page_exists_filter_bundle->add($page[$key_field]); @@ -255,7 +255,7 @@ class WebArchiveBundle /** * */ - public function addObjectsPartition($offset_field, $partition, + function addObjectsPartition($offset_field, $partition, &$objects, $data = NULL, $callback = NULL, $return_flag = true) { $num_objects = count($objects); @@ -268,7 +268,7 @@ class WebArchiveBundle /** * */ - public function readPartitionInfoBlock($partition) + function readPartitionInfoBlock($partition) { return $this->getPartition($partition)->readInfoBlock(); } @@ -276,7 +276,7 @@ class WebArchiveBundle /** * */ - public function writePartitionInfoBlock($partition, &$data) + function writePartitionInfoBlock($partition, &$data) { $this->getPartition($partition)->writeInfoBlock(NULL, $data); } @@ -284,7 +284,7 @@ class WebArchiveBundle /** * */ - public function differencePageKeysFilter($pages, $key_field) + function differencePageKeysFilter($pages, $key_field) { if($this->filter_size > 0) { $page_array = array(); @@ -303,7 +303,7 @@ class WebArchiveBundle /** * */ - public function differencePagesFilter(&$page_array, $field_name = NULL) + function differencePagesFilter(&$page_array, $field_name = NULL) { $this->page_exists_filter_bundle->differenceFilter( $page_array, $field_name); @@ -312,7 +312,7 @@ class WebArchiveBundle /** * */ - public function forceSave() + function forceSave() { if($this->filter_size > 0) { $this->page_exists_filter_bundle->forceSave(); @@ -322,7 +322,7 @@ class WebArchiveBundle /** * */ - public function getPartition($index, $fast_construct = true) + function getPartition($index, $fast_construct = true) { if(!isset($this->partition[$index])) { //this might not have been open yet @@ -355,7 +355,7 @@ class WebArchiveBundle /** * */ - public static function getArchiveInfo($dir_name) + static function getArchiveInfo($dir_name) { if(!is_dir($dir_name) || !file_exists($dir_name."/description.txt")) { $info 
= array(); @@ -375,7 +375,7 @@ class WebArchiveBundle /** * */ - public static function selectPartition($value, $num_partitions) + static function selectPartition($value, $num_partitions) { $hash = substr(md5($value, true), 0, 4); diff --git a/lib/web_queue_bundle.php b/lib/web_queue_bundle.php index 13a5abad3..f64f21f67 100755 --- a/lib/web_queue_bundle.php +++ b/lib/web_queue_bundle.php @@ -64,7 +64,9 @@ require_once 'web_archive.php'; require_once 'utility.php'; /** - * + * Encapsulates the data structures needed to have a queue of urls to crawl + * next + * * @author Chris Pollett * * @package seek_quarry @@ -136,12 +138,15 @@ class WebQueueBundle implements Notifier */ var $crawl_delay_filter; + /** + * + */ const max_url_archive_offset = 1000000000; /** * */ - public function __construct($dir_name, + function __construct($dir_name, $filter_size, $num_urls_ram, $min_or_max) { $this->dir_name = $dir_name; @@ -227,7 +232,7 @@ class WebQueueBundle implements Notifier /** * */ - public function addUrlsQueue(&$url_pairs) + function addUrlsQueue(&$url_pairs) { $add_urls = array(); $count = count($url_pairs); @@ -269,7 +274,7 @@ class WebQueueBundle implements Notifier /** * */ - public function containsUrlQueue(&$url) + function containsUrlQueue(&$url) { $hash_url = crawlHash($url, true); $lookup_url = $this->lookupHashTable($hash_url); @@ -279,7 +284,7 @@ class WebQueueBundle implements Notifier /** * */ - public function adjustQueueWeight(&$url, $delta) + function adjustQueueWeight(&$url, $delta) { $hash_url = crawlHash($url, true); $data = $this->lookupHashTable($hash_url); @@ -297,7 +302,7 @@ class WebQueueBundle implements Notifier /** * */ - public function removeQueue($url) + function removeQueue($url) { $hash_url = crawlHash($url, true); $data = $this->lookupHashTable($hash_url); @@ -319,7 +324,7 @@ class WebQueueBundle implements Notifier /** * */ - public function peekQueue($i = 1, $fh = NULL) + function peekQueue($i = 1, $fh = NULL) { $tmp = 
$this->to_crawl_queue->peek($i); if(!$tmp) { @@ -350,7 +355,7 @@ class WebQueueBundle implements Notifier /** * */ - public function printContents() + function printContents() { $count = $this->to_crawl_queue->count; @@ -360,7 +365,7 @@ class WebQueueBundle implements Notifier } } - public function getContents() + function getContents() { $count = $this->to_crawl_queue->count; $contents = array(); @@ -373,7 +378,7 @@ class WebQueueBundle implements Notifier /** * */ - public function normalize($new_total = NUM_URLS_QUEUE_RAM) + function normalize($new_total = NUM_URLS_QUEUE_RAM) { $this->to_crawl_queue->normalize(); } @@ -383,7 +388,7 @@ class WebQueueBundle implements Notifier /** * */ - public function openUrlArchive($mode = "r") + function openUrlArchive($mode = "r") { return $this->to_crawl_archive->open($mode); } @@ -391,7 +396,7 @@ class WebQueueBundle implements Notifier /** * */ - public function closeUrlArchive($fh) + function closeUrlArchive($fh) { $this->to_crawl_archive->close($fh); } @@ -399,7 +404,7 @@ class WebQueueBundle implements Notifier /** * */ - public function addSeenUrlFilter($url) + function addSeenUrlFilter($url) { $this->url_exists_filter_bundle->add($url); } @@ -407,7 +412,7 @@ class WebQueueBundle implements Notifier /** * */ - public function differenceSeenUrls(&$url_array, $field_name = NULL) + function differenceSeenUrls(&$url_array, $field_name = NULL) { $this->url_exists_filter_bundle->differenceFilter( $url_array, $field_name); @@ -416,7 +421,7 @@ class WebQueueBundle implements Notifier /** * */ - public function addGotRobotTxtFilter($host) + function addGotRobotTxtFilter($host) { $this->got_robottxt_filter->add($host); } @@ -424,7 +429,7 @@ class WebQueueBundle implements Notifier /** * */ - public function containsGotRobotTxt($host) + function containsGotRobotTxt($host) { return $this->got_robottxt_filter->contains($host); } @@ -432,7 +437,7 @@ class WebQueueBundle implements Notifier /** * */ - public function 
addDisallowedRobotFilter($host) + function addDisallowedRobotFilter($host) { $this->dissallowed_robot_filter->add($host); } @@ -440,7 +445,7 @@ class WebQueueBundle implements Notifier /** * */ - public function containsDisallowedRobot($host_path) + function containsDisallowedRobot($host_path) { return $this->dissallowed_robot_filter->contains($host_path); } @@ -448,7 +453,7 @@ class WebQueueBundle implements Notifier /** * */ - public function getRobotTxtAge() + function getRobotTxtAge() { $creation_time = intval( @@ -460,7 +465,7 @@ class WebQueueBundle implements Notifier /** * */ - public function setCrawlDelay($host, $value) + function setCrawlDelay($host, $value) { $this->crawl_delay_filter->add("-1".$host); //used to say a crawl delay has been set @@ -476,7 +481,7 @@ class WebQueueBundle implements Notifier /** * */ - public function getCrawlDelay($host) + function getCrawlDelay($host) { if(!$this->crawl_delay_filter->contains("-1".$host)) { return -1; @@ -495,7 +500,7 @@ class WebQueueBundle implements Notifier /** * */ - public function constructHashTable($name, $num_values) + function constructHashTable($name, $num_values) { $this->hash_rebuild_count = 0; $this->max_hash_ops_before_rebuild = floor($num_values/4); @@ -505,7 +510,7 @@ class WebQueueBundle implements Notifier /** * */ - public function lookupHashTable($key) + function lookupHashTable($key) { return $this->to_crawl_table->lookup($key); } @@ -513,7 +518,7 @@ class WebQueueBundle implements Notifier /** * */ - public function deleteHashTable($value) + function deleteHashTable($value) { $this->to_crawl_table->delete($value); $this->hash_rebuild_count++; @@ -525,7 +530,7 @@ class WebQueueBundle implements Notifier /** * */ - public function insertHashTable($key, $value) + function insertHashTable($key, $value) { $this->hash_rebuild_count++; if($this->hash_rebuild_count > $this->max_hash_ops_before_rebuild) { @@ -537,7 +542,7 @@ class WebQueueBundle implements Notifier /** * */ - public function 
rebuildHashTable() + function rebuildHashTable() { crawlLog("Rebuilding Hash table"); $num_values = $this->to_crawl_table->num_values; @@ -571,7 +576,7 @@ class WebQueueBundle implements Notifier * Since offsets are integers, even if the queue is kept relatively small, * periodically we will need to rebuild the archive for storing urls. */ - public function rebuildUrlTable() + function rebuildUrlTable() { crawlLog("Rebuilding URL table"); $dir_name = $this->dir_name; @@ -613,7 +618,7 @@ class WebQueueBundle implements Notifier /** * */ - public function emptyRobotFilters() + function emptyRobotFilters() { unlink($this->dir_name."/got_robottxt.ftr"); unlink($this->dir_name."/dissallowed_robot.ftr"); diff --git a/models/phrase_model.php b/models/phrase_model.php index e2225d2f5..9a5b40dac 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -98,9 +98,6 @@ class PhraseModel extends Model $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $phrase); $phrase_string = preg_replace("/(\s)+/", " ", $phrase_string); - - $phrase_hash = crawlHash($phrase_string); - /* we search using the stemmed words, but we format snippets in the results by bolding either @@ -109,11 +106,17 @@ class PhraseModel extends Model $words = array_keys(PhraseParser::extractPhrasesAndCount($phrase_string)); //stemmed + if(isset($words) && count($words) == 1) { + $phrase_string = $words[0]; + } + $phrase_hash = crawlHash($phrase_string); + + $phrase_info = $index_archive->getPhraseIndexInfo($phrase_hash); - if($index_archive->getPhraseIndexInfo($phrase_hash) != NULL) { + if($phrase_info != NULL) { $results = $index_archive->getSummariesByHash( - $phrase_hash, $low, $results_per_page); + $phrase_hash, $low, $results_per_page, NULL, NULL, $phrase_info); if(count($results) == 0) { $results = NULL;