diff --git a/bin/queue_server.php b/bin/queue_server.php index 8ab704959..abfcb19a8 100755 --- a/bin/queue_server.php +++ b/bin/queue_server.php @@ -154,7 +154,10 @@ class QueueServer implements CrawlConstants var $most_recent_fetcher; /** + * Makes a queue_server object with the supplied indexed_file_types * + * As part of the creation process, a database manager is initialized so + * the queue_server cna make use of its file/folder manipulation functions. */ function __construct($indexed_file_types) { @@ -247,6 +250,8 @@ class QueueServer implements CrawlConstants /** * + * @param array $info + * @return array */ function handleAdminMessages($info) { @@ -317,6 +322,7 @@ class QueueServer implements CrawlConstants /** * + * @param array $info */ function startCrawl($info) { @@ -394,6 +400,8 @@ class QueueServer implements CrawlConstants /** * + * @param string $base_dir + * @param string $callback_method */ function processDataFile($base_dir, $callback_method) { @@ -441,6 +449,7 @@ class QueueServer implements CrawlConstants /** * + * @param string $file */ function processIndexArchive($file) { @@ -552,6 +561,7 @@ class QueueServer implements CrawlConstants /** * + * @param string $file */ function processRobotArchive($file) { @@ -608,6 +618,7 @@ class QueueServer implements CrawlConstants /** * + * @return array */ function processQueueUrls() { @@ -638,7 +649,8 @@ class QueueServer implements CrawlConstants } /** - * + * @param string $file + * @return array */ function processDataArchive($file) { @@ -782,6 +794,7 @@ class QueueServer implements CrawlConstants /** * + * @param array &$sites */ function deleteSeenUrls(&$sites) { @@ -790,7 +803,17 @@ class QueueServer implements CrawlConstants /** + * Produces a schedule.txt file of url data for a fetcher to crawl next. + * + * The hard part of scheduling is to make sure that the overall crawl + * process obeys robots.txt files. 
This involves checking the url is in + * an allowed path for that host and it also involves making sure the + * Crawl-delay directive is respected. The first fetcher that contacts the + * server requesting data to crawl will get the schedule.txt + * produced by produceFetchBatch() at which point it will be unlinked + * (these latter thing are controlled in FetchController). * + * @see FetchController */ function produceFetchBatch() { @@ -985,7 +1008,15 @@ class QueueServer implements CrawlConstants } /** + * Gets the first unfilled schedule slot after $index in $arr + * + * A schedule of sites for a fetcher to crawl consists of MAX_FETCH_SIZE + * many slots earch of which could eventually hold url information. + * This function is used to schedule slots for crawl-delayed host. * + * @param int $index location to begin searching for an empty slot + * @param array $arr list of slots to look in + * @return int index of first available slot */ function getEarliestSlot($index, $arr) { @@ -1002,7 +1033,11 @@ class QueueServer implements CrawlConstants /** + * Checks if url belongs to a list of sites that are allowed to be + * crawled * + * @param string $url url to check + * @return bool whether is allowed to be crawled or not */ function allowedToCrawlSite($url) { @@ -1018,7 +1053,11 @@ class QueueServer implements CrawlConstants } /** + * Checks if url belongs to a list of sites that aren't supposed to be + * crawled * + * @param string $url url to check + * @return bool whether is shouldn't be crawled */ function disallowedToCrawlSite($url) { @@ -1026,7 +1065,14 @@ class QueueServer implements CrawlConstants } /** + * Checks if the url belongs to one of the sites list in site_array + * Sites can be either given in the form domain:host or + * in the form of a url in which case it is check that the site url + * is a substring of the passed url. 
* + * @param string $url url to check + * @param array $site_array sites to check against + * @return bool whether the url belongs to one of the sites */ function urlMemberSiteArray($url, $site_array) { @@ -1049,7 +1095,9 @@ class QueueServer implements CrawlConstants } /** + * Gets a list of all the timestamps of previously stored crawls * + * @return array list of timestamps */ function getCrawlTimes() { diff --git a/controllers/fetch_controller.php b/controllers/fetch_controller.php index ae6143797..04adb8cd3 100755 --- a/controllers/fetch_controller.php +++ b/controllers/fetch_controller.php @@ -67,6 +67,9 @@ class FetchController extends Controller implements CrawlConstants var $activities = array("schedule", "update", "crawlTime"); /** + * Checks that the request seems to be coming from a legitimate fetcher then + * determines which activity the fetcher is requesting and calls that + * activity for processing. * */ function processRequest() @@ -79,11 +82,12 @@ class FetchController extends Controller implements CrawlConstants if(!$this->checkRequest()) {return; } $activity = $_REQUEST['a']; - $this->$activity(); + if(in_array($activity, $this->activities)) {$this->$activity();} } /** - * + * Checks if there is a schedule of sites to crawl available and + * if so present it to the requesting fetcher, and then delete it. */ function schedule() { @@ -106,7 +110,8 @@ class FetchController extends Controller implements CrawlConstants } /** - * + * Processes Robot, To Crawl, and Index data sent from a fetcher + * Acknowledge to the fetcher if this data was received okay. 
*/ function update() { @@ -144,10 +149,17 @@ class FetchController extends Controller implements CrawlConstants /** - * @param &array $sites - * @param string $address - * @param string $day - * @param string $time + * Adds a file containing the seen sites and inverted index from the + * just received $sites array to the schedules folder's index directory's + * subfolder for the current crawl time. This file is added in a sub folder + * $day and its name contains the $time at which it arrived and the ip + * $address from which it arrived. This file will then be processed later + * by the queue server. + * + * @param &array $sites a list of seen sites and an inverted index + * @param string $address the IP address of the sending machine with . -->_ + * @param string $day timestamp in seconds converted to days + * @param string $time timestamp in seconds */ function addToIndexSchedules(&$sites, $address, $day, $time) { @@ -171,10 +183,19 @@ class FetchController extends Controller implements CrawlConstants /** - * @param &array $sites - * @param string $address - * @param string $day - * @param string $time + * Adds a file containing the to-crawl sites from the just received + * $sites array to the schedules folder's schedule data directory's + * subfolder for the current crawl time. This file is added in a sub folder + * $day and its name contains the $time at which it arrived and the ip + * $address from which it arrived. This file will then be processed later + * by the queue server. In addition to to-crawl sites the seen urls + * in $sites are also saved in the file. They are used to perform a sanity + * check on the priority queue by the queue server. + * + * @param &array $sites a list of seen sites and to crawl sites + * @param string $address the IP address of the sending machine with . 
-->_ + * @param string $day timestamp in seconds converted to days + * @param string $time timestamp in seconds */ function addToCrawlSchedules(&$sites, $address, $day, $time) { @@ -207,10 +228,17 @@ class FetchController extends Controller implements CrawlConstants /** - * @param &array $sites - * @param string $address - * @param string $day - * @param string $time + * Adds a file containing the robot site data from the just received + * $sites array to the schedules folder's robot data directory's + * subfolder for the current crawl time. This file is added in a sub folder + * $day and its name contains the $time at which it arrived and the ip + * $address from which it arrived. This file will then be processed later + * by the queue server. + * + * @param &array $sites a list of robot site data + * @param string $address the IP address of the sending machine with . -->_ + * @param string $day timestamp in seconds converted to days + * @param string $time timestamp in seconds */ function addRobotSchedules(&$sites, $address, $day, $time) { @@ -228,12 +256,14 @@ class FetchController extends Controller implements CrawlConstants /** + * Adds a file with contents $data and with name containing $address and + * $time to a subfolder $day of a folder $dir * - * @param string $dir - * @param &array $data - * @param string $address - * @param string $day - * @param string $time + * @param string $dir directory in which to add the schedule file + * @param &array $data data that the schedule file is to contain + * @param string $address the IP address of the sending machine with . 
-->_ + * @param string $day timestamp in seconds converted to days + * @param string $time timestamp in seconds */ function addScheduleToScheduleDirectory($dir, &$data, $address, $day, $time) { diff --git a/lib/hash_table.php b/lib/hash_table.php index 49474e0dd..6f16e89fe 100755 --- a/lib/hash_table.php +++ b/lib/hash_table.php @@ -34,9 +34,12 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * + * Loads the base class */ require_once "string_array.php"; +/** + * Needed for crawlHash + */ require_once "utility.php"; /** @@ -53,51 +56,60 @@ class HashTable extends StringArray { /** + * The size in bytes for keys stored in the hash table * * @var int */ var $key_size; /** + * The size in bytes of values associated with keys * * @var int */ var $value_size; /** - * + * Holds an all \0 string used of length $this->key_size * @var string */ var $null; /** - * + * Holds \0\0 followed by an all \FF string of length $this->key_size -1 + * Used to indicate that a slot once held data but that data was deleted. 
+ * Such a slot tells a lookup to keep going, but on an insert can be + * overwritten if the inserted key is not already in the table * @var string */ var $deleted; /** - * + * Number of items currently in the hash table * @var int */ var $count; /** - * - * + * Flag for lookup methods */ const ALWAYS_RETURN_PROBE = 1; /** - * - * + * Flag for lookup methods */ const RETURN_PROBE_ON_KEY_FOUND = 0; /** - * - * + * Flag for lookup methods */ const RETURN_VALUE = -1; /** + * Makes a persistently stored (i.e., on disk and ram) hash table using the + * supplied parameters * - * + * @param string $fname filename to use when storing the hash table to disk + * @param int $num_values number of key value pairs the table can hold + * @param int $key_size number of bytes to store a hash table key + * @param int $value_size number of bytes to store a hash table value + * @param int $save_frequency how many non-read operations before saving to + * disk */ function __construct($fname, $num_values, $key_size, $value_size, $save_frequency = self::DEFAULT_SAVE_FREQUENCY) { @@ -114,8 +126,12 @@ } /** + * Inserts the provided $key - $value pair into the hash table * - * + * @param string $key the key to use for the insert (will be needed for + * lookup) + * @param string $value the value associated with $key + * @return bool whether the insert was successful or not */ function insert($key, $value) { @@ -162,8 +178,19 @@ /** + * Tries to lookup the key in the hash table either returning the + * location where it was found or the value associated with the key. * - * + * @param string $key key to look up in the hash table + * @param int $return_probe_value one of self::ALWAYS_RETURN_PROBE, + * self::RETURN_PROBE_ON_KEY_FOUND, or self::RETURN_VALUE. 
Here + * value means the value associated with the key and probe is + * either the location in the array where the key was found or + * the first location in the array where it was determined the + * key could not be found. + * @return mixed would be string if the value is being returned, + * an int if the probe is being returned, and false if the key + * is not found */ function lookup($key, $return_probe_value = self::RETURN_VALUE) { @@ -172,8 +199,26 @@ class HashTable extends StringArray } /** + * Tries to lookup the key in the hash table either return the + * location where it was found or the value associated with the key. + * If the key is not at the initial probe value, linear search in the + * table is done. The values which cut-off the search are stored in + * $null_array. Using an array allows for flexibility since a deleted + * entry needs to be handled different when doing a lookup then when + * doing an insert. * - * + * @param string $key key to look up in the hash table + * @param array $null_array key values that would cut-off the search + * for key if the initial probe failed + * @param int $return_probe_value one of self::ALWAYS_RETURN_PROBE, + * self::RETURN_PROBE_ON_KEY_FOUND, or self::RETURN_VALUE. Here + * value means the value associated with the key and probe is + * either the location in the array where the key was found or + * the first location in the array where it was determined the + * key could not be found. 
+ * @return mixed would be string if the value is being returned, + * an int if the probe is being returned, and false if the key + * is not found */ function lookupArray($key, $null_array, $return_probe_value = self::RETURN_VALUE) @@ -212,8 +257,10 @@ class HashTable extends StringArray } /** + * Deletes the data associated with the provided key from the hash table * - * + * @param string $key the key to delete the entry for + * @return bool whether or not something was deleted */ function delete($key) { @@ -234,8 +281,10 @@ class HashTable extends StringArray } /** + * Get the ith entry of the array for the hash table (no hashing here) * - * + * @param int $i an index of the hash table array + * @return array the key value pair stored at this index */ function getEntry($i) { @@ -247,8 +296,10 @@ class HashTable extends StringArray } /** + * Hashes the provided key to an index in the array of the hash table * - * + * @param string $key a key to hashed into the hash table + * @return int an index in the array of the hash table */ function hash($key) { diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php index 4a1358dc3..b06780150 100644 --- a/lib/index_archive_bundle.php +++ b/lib/index_archive_bundle.php @@ -132,29 +132,79 @@ function setOffsetPointers($data, &$objects, $offset_field) */ class WordIterator implements IndexingConstants, CrawlConstants { + /** + * + */ var $word_key; + /** + * + */ var $index; + /** + * + */ var $seen_docs; + /** + * + */ var $restricted_seen_docs; + /** + * + */ var $count_block_unfiltered; + /** + * + */ var $num_docs; - + /** + * + */ var $next_offset; + /** + * + */ var $last_pointed_block; + /** + * + */ var $list_offset; - + /** + * + */ var $block_pointers; + /** + * + */ var $num_full_blocks; + /** + * + */ var $num_generations; + /** + * + */ var $last_block; + /** + * + */ var $info_block; + /** + * + */ var $current_pointer; + /** + * + */ var $limit; /** * + * @param string $word_key + * @param object 
$index + * @param int $limit + * @param object $info_block */ public function __construct($word_key, $index, $limit = 0, $info_block = NULL) { @@ -166,6 +216,7 @@ class WordIterator implements IndexingConstants, CrawlConstants /** * + * @param object $info_block */ public function reset($info_block = NULL) { @@ -215,6 +266,7 @@ class WordIterator implements IndexingConstants, CrawlConstants /** * + * @return bool */ public function initGeneration() { @@ -267,6 +319,8 @@ class WordIterator implements IndexingConstants, CrawlConstants /** * + * @param array $restrict_phrases + * @return array */ public function currentDocsWithWord($restrict_phrases = NULL) { @@ -389,6 +443,8 @@ class WordIterator implements IndexingConstants, CrawlConstants /** * + * @param array $restrict_phrases + * @return array */ public function nextDocsWithWord($restrict_phrases = NULL) { @@ -444,6 +500,11 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param string $dir_name + * @param int $filter_size + * @param int $num_partitions_summaries + * @param int $num_parititions_index + * @param string $description */ public function __construct($dir_name, $filter_size = -1, $num_partitions_summaries = NULL, $num_partitions_index = NULL, @@ -486,6 +547,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param string $key_field + * @param string $offset_field + * @param array $pages + * @return array */ public function addPages($key_field, $offset_field, $pages) { @@ -496,6 +561,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param array $index_data */ public function addIndexData($index_data) { @@ -549,6 +615,9 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param int $partition + * @param array &$word_data + * @param bool $overwrite */ public function addPartitionWordData($partition, &$word_data, $overwrite = false) @@ -665,6 +734,9 @@ class 
IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param int $partition + * @param string $word_key + * @return bool */ public function addPartitionIndexFilter($partition, $word_key) { @@ -681,6 +753,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param int $partition + * @return bool */ public function initPartitionIndexFilter($partition) { @@ -704,6 +778,13 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param string $word_key + * @param int $Lint + * @param int $num + * @param array $restrict_phrases + * @param string $phrase_key + * @param array $phrase_info + * @return array */ public function getSummariesByHash($word_key, $limit, $num, $restrict_phrases = NULL, $phrase_key = NULL, $phrase_info = NULL) @@ -751,6 +832,9 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param string $key + * @param int $offset + * @return array */ public function getPage($key, $offset) { @@ -759,6 +843,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param string $word_key + * @param int $offset + * @param int $generation + * @return array */ public function getWordDocBlock($word_key, $offset, $generation = -1) { @@ -773,6 +861,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param int $partition + * @param int $offset + * @param resource $file_handle + * @return array */ public function getPageByPartition($partition, $offset, $file_handle = NULL) { @@ -782,6 +874,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param string $key_field + * @param array $page */ public function addPageFilter($key_field, $page) { @@ -813,6 +907,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param string $phrase_key + * @param int $generation_index + * @param array $info_block + * @return array */ public function 
getPhraseIndexInfo( $phrase_key, $generation_index = 0, $info_block = NULL) @@ -889,6 +987,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param string $phrase_key + * @param array $info */ public function setPhraseIndexInfo($phrase_key, $info) { @@ -909,6 +1009,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param string $word_key + * @param array $restrict_phrases + * @param string $phrase_key + * @param $num_needed */ public function addPhraseIndex($word_key, $restrict_phrases, $phrase_key, $num_needed) @@ -979,6 +1083,10 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param array $word_keys + * @param int $num + * @param string $comparison + * @return array */ public function getSelectiveWords($word_keys, $num, $comparison="lessThan") //lessThan is in utility.php @@ -1002,6 +1110,9 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param int $partition + * @param int $generation + * @return array */ public function readPartitionInfoBlock($partition, $generation = -1) { @@ -1017,6 +1128,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants /** * + * @param int $partition + * @param array $data */ public function writePartitionInfoBlock($partition, $data) { diff --git a/lib/priority_queue.php b/lib/priority_queue.php index b50468948..763863c3d 100755 --- a/lib/priority_queue.php +++ b/lib/priority_queue.php @@ -38,15 +38,15 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} */ require_once "string_array.php"; /** - * + * A Notifier is called when data in the queue is move around */ require_once "notifier.php"; /** - * + * Loaded for crawlLog function */ require_once "utility.php"; /** - * + * Constants shared amoung classes involved in storing web crawl data */ require_once "crawl_constants.php"; @@ -63,40 +63,50 @@ require_once "crawl_constants.php"; class PriorityQueue extends StringArray 
implements CrawlConstants { /** - * - * + * Number of values that can be stored in the priority queue + * @var int */ var $num_values; /** - * - * + * Number of bytes needed to store a value associated with a weight + * @var int */ var $value_size; /** - * - * + * Number of bytes needed to store a weight in the queue + * @var int */ var $weight_size = 4; //size of a float /** - * - * + * Number of items that are currently stored in the queue + * @var int */ var $count; /** - * - * + * Whether polling the queue returns the least or most weighted value + * @var string */ var $min_or_max; /** - * - * + * An object that implements the Notifier interface (for instance, + * WebQueueArchive) + * @var object */ var $notifier; // who to call if move an item in queue /** + * Makes a priority queue with the given operating parameters * + * @param string $fname filename to store the data associated with the queue + * @param int $num_values number of values the queue can hold + * @param int $value_size the size in bytes of a value + * @param string $min_or_max whether this priority queue returns least or + * most weighted values when polled + * @param object $notifier object to call when a value changes in the queue + * @param int $save_frequency how often the data in the queue should be + * saved to disk. 
(It's default location is RAM) */ function __construct($fname, $num_values, $value_size, $min_or_max, $notifier = NULL, @@ -116,7 +126,11 @@ class PriorityQueue extends StringArray implements CrawlConstants } /** + * Gets the data stored at the ith location in the priority queue * + * @param int $i location to return data from + * @return mixed data if the value of $i is between 1 and count, false + * otherwise */ function peek($i = 1) { @@ -129,6 +143,8 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * + * @param int $i + * @return mixed */ function poll($i = 1) { @@ -149,7 +165,9 @@ class PriorityQueue extends StringArray implements CrawlConstants } /** - * + * @param string $data + * @param float $weight + * @return mixed */ function insert($data, $weight) { @@ -168,6 +186,8 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * + * @param int $i + * @param float $delta */ function adjustWeight($i, $delta) { @@ -209,6 +229,7 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * + * @return array */ function getContents() { @@ -222,6 +243,7 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * + * @param int $new_total */ function normalize($new_total = NUM_URLS_QUEUE_RAM) { @@ -246,6 +268,8 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * + * @param int $i + * @return int */ function percolateUp($i) { @@ -274,6 +298,8 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * + * @param int $i + * @return int */ function percolateDown($i) { @@ -314,6 +340,9 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * + * @param float $value1 + * @param float $value2 + * @return float */ function compare($value1, $value2) { @@ -326,6 +355,9 @@ class PriorityQueue extends StringArray implements CrawlConstants /** * + * + * @param int $i + * @return array */ function getRow($i) { @@ -345,7 +377,13 @@ class PriorityQueue 
extends StringArray implements CrawlConstants } /** + * Add data to the $i row of the priority queue viewed as an array + * Calls the notifier associated with this queue about the change + * in data's location * + * @param int $i location to add data + * @param array $row data to add (a two element array in the form + * key, float value). */ function putRow($i, $row) { @@ -359,7 +397,9 @@ class PriorityQueue extends StringArray implements CrawlConstants } /** + * Computes and returns the weight of all items in prority queue * + * @return float weight of all items stored in the priority queue */ function totalWeight() { diff --git a/lib/processors/doc_processor.php b/lib/processors/doc_processor.php index 1260db9e9..bd57b603a 100755 --- a/lib/processors/doc_processor.php +++ b/lib/processors/doc_processor.php @@ -34,7 +34,7 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * + * Load the base class */ require_once BASE_DIR."/lib/processors/text_processor.php"; diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index e24e3ad03..ec7930e8e 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -34,9 +34,12 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * Load base class, if needed. We also might need to parse urls + * Load base class, if needed. 
*/ require_once BASE_DIR."/lib/processors/text_processor.php"; +/** + * Load so can parse urls + */ require_once BASE_DIR."/lib/url_parser.php"; /** diff --git a/lib/processors/pdf_processor.php b/lib/processors/pdf_processor.php index d83df545e..b8e6fb415 100755 --- a/lib/processors/pdf_processor.php +++ b/lib/processors/pdf_processor.php @@ -79,7 +79,10 @@ class PdfProcessor extends TextProcessor } /** + * Gets the text out of a PDF document * + * @param string $pdf_string a string representing the PDF document + * @return string text extracted from the document */ static function getText($pdf_string) { $len = strlen($pdf_string); @@ -128,7 +131,12 @@ class PdfProcessor extends TextProcessor } /** + * Gets between an obj and endobj tag at the current position in a PDF + * document * + * @param string $pdf_string astring of a PDF document + * @param int $cur_pos a integer postion in that string + * @return string the contents of the PDF object located at $cur_pos */ static function getNextObject($pdf_string, $cur_pos) { @@ -137,11 +145,13 @@ class PdfProcessor extends TextProcessor /** * + * @param string $object_dictionary + * @param array $type_array */ - static function objectDictionaryHas($object_Dictionary, $type_array) + static function objectDictionaryHas($object_dictionary, $type_array) { foreach ($type_array as $type) { - if(strstr($object_Dictionary, $type)) { + if(strstr($object_dictionary, $type)) { return true; } } @@ -151,6 +161,8 @@ class PdfProcessor extends TextProcessor /** * + * @param string $object_string + * @return string */ static function getObjectDictionary($object_string) { @@ -161,6 +173,8 @@ class PdfProcessor extends TextProcessor /** * + * @param string $object_stream + * @return string */ static function getObjectStream($object_string) { @@ -172,6 +186,8 @@ class PdfProcessor extends TextProcessor /** * + * @param string $data + * @return string */ static function parseText($data) { @@ -212,6 +228,9 @@ class PdfProcessor extends 
TextProcessor /** * + * @param string $data + * @param int $cur_pos + * @return array */ static function parseBrackets($data, $cur_pos) { @@ -250,6 +269,9 @@ class PdfProcessor extends TextProcessor /** * + * @param string $data + * @param int $cur_pos + * @return array */ static function parseParentheses($data, $cur_pos) { diff --git a/lib/processors/ppt_processor.php b/lib/processors/ppt_processor.php index e49d05ada..0d72ea581 100755 --- a/lib/processors/ppt_processor.php +++ b/lib/processors/ppt_processor.php @@ -39,7 +39,7 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} require_once BASE_DIR."/lib/processors/text_processor.php"; - /** +/** * Used to create crawl summary information * for PPT files * @@ -58,7 +58,21 @@ class PptProcessor extends TextProcessor const ALWAYS_IGNORE = 6; /** + * Computes a summary based on a string of a binary Powerpoint document + * (as opposed to the modern xml powerpoint format). + * + * Text is extracted from the Powerpoint document using a crude finite + * state machine that was developed by looking at a few Powerpoint + * documents in a Hex editor. Then the TextProcessor:: process() method + * is used to make a summary * + * @param string $page string of a Powerpoint document + * @param string $url location the document came from, not used by + * TextProcessor at this point. 
Some of its subclasses override + * this method and use url to produce complete links for + * relative links within a document + * @return array a summary of (title, description,links, and content) of + * the information in $page */ public static function process($page, $url) { diff --git a/lib/processors/rtf_processor.php b/lib/processors/rtf_processor.php index 16e504292..96994dfb7 100755 --- a/lib/processors/rtf_processor.php +++ b/lib/processors/rtf_processor.php @@ -50,7 +50,9 @@ class RtfProcessor extends TextProcessor { /** - * + * @param string $page + * @param string $url + * @return array */ public static function process($page, $url) { @@ -71,6 +73,8 @@ class RtfProcessor extends TextProcessor /** * + * @param string $rtf_string + * @return string */ static function extractText($rtf_string) { $rtf_string = preg_replace('/\\\{/',"!ZZBL!", $rtf_string); @@ -89,6 +93,8 @@ class RtfProcessor extends TextProcessor /** * + * @param string $rtf_string + * @return string */ static function getText($rtf_string) { @@ -125,6 +131,9 @@ class RtfProcessor extends TextProcessor /** * + * @param string $rtf_string + * @param int $cur_pos + * @return string */ static function getNextObject($rtf_string, $cur_pos) { diff --git a/lib/processors/text_processor.php b/lib/processors/text_processor.php index 8773576d3..ad518840c 100755 --- a/lib/processors/text_processor.php +++ b/lib/processors/text_processor.php @@ -34,7 +34,7 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * + * Loads in the common constant used by all classes related to crawling */ require_once BASE_DIR."/lib/crawl_constants.php"; @@ -51,7 +51,15 @@ class TextProcessor implements CrawlConstants { /** + * Computes a summary based on a text string of a document * + * @param string $page text string of a document + * @param string $url location the document came from, not used by + * TextProcessor at this point. 
Some of its subclasses override + * this method and use url to produce complete links for + * relative links within a document + * @return array a summary of (title, description,links, and content) of + * the information in $page */ static function process($page, $url) { @@ -65,7 +73,15 @@ class TextProcessor implements CrawlConstants } /** + * Gets the text between two tags in a document starting at the current + * position. * + * @param string $string document to extract text from + * @param int $cur_pos current location to look if can extract text + * @param string $start_tag starting tag that we want to extract after + * @param string $end_tag ending tag that we want to extract until + * @return array pair consisting of when in the document we are after + * the end tag, together with the data between the two tags */ static function getBetweenTags($string, $cur_pos, $start_tag, $end_tag) { diff --git a/lib/web_archive.php b/lib/web_archive.php index b41a00dc7..85ffea504 100755 --- a/lib/web_archive.php +++ b/lib/web_archive.php @@ -81,6 +81,9 @@ class WebArchive const CLOSE = 3; /** * + * @param string $fname + * @param string $compressor + * @param bool $fast_construct */ function __construct($fname, $compressor, $fast_construct = false) { @@ -102,6 +105,7 @@ class WebArchive /** * + * @return array */ function readInfoBlock() { @@ -120,6 +124,8 @@ class WebArchive /** * + * @param resource $fh + * @param array &$data */ function writeInfoBlock($fh = NULL, &$data = NULL) { @@ -148,6 +154,8 @@ class WebArchive /** * + * @param resource $fh + * @return int */ function seekEndObjects($fh) { @@ -161,6 +169,12 @@ class WebArchive /** * + * @param string $offset_field + * @param array &$objects + * @param array $data + * @param string $callback + * @param bool $return_flag + * @return mixed */ function addObjects($offset_field, &$objects, $data = NULL, $callback = NULL, $return_flag = true) @@ -211,6 +225,8 @@ class WebArchive /** * + * @param string $mode + * 
@return resource */ function open($mode = "r") { @@ -228,6 +244,11 @@ class WebArchive /** * + * @param int $offset + * @param int $num + * @param bool $next_flag + * @param resource $fh + * @return array */ function getObjects($offset, $num, $next_flag = true, $fh = NULL) { diff --git a/lib/web_archive_bundle.php b/lib/web_archive_bundle.php index ff7fac6d4..9f77ca1e8 100755 --- a/lib/web_archive_bundle.php +++ b/lib/web_archive_bundle.php @@ -103,6 +103,11 @@ class WebArchiveBundle /** * + * @param string $dir_name + * @param int $filter_size + * @param int $num_partitions + * @param string $description + * @param string $compressor */ function __construct($dir_name, $filter_size = -1, $num_partitions = NULL, $description = NULL, @@ -172,6 +177,10 @@ class WebArchiveBundle /** * + * @param string $key_field + * @param string $offset_field + * @param array &$pages + * @return array */ function addPages($key_field, $offset_field, &$pages) { @@ -214,6 +223,9 @@ class WebArchiveBundle /** * + * @param string $key + * @param int $offset + * @return array */ function getPage($key, $offset) { @@ -225,6 +237,10 @@ class WebArchiveBundle /** * + * @param int $partition + * @param int $offset + * @param resource $file_handle + * @return array */ function getPageByPartition($partition, $offset, $file_handle = NULL) { @@ -241,6 +257,9 @@ class WebArchiveBundle /** * + * @param string $key_field + * @param array &$page + * @return bool */ function addPageFilter($key_field, &$page) { @@ -254,6 +273,13 @@ class WebArchiveBundle /** * + * @param string $offset_field + * @param int $partition + * @param array &$objects + * @param array $data + * @param string $callback + * @param bool $return_flag + * @return mixed */ function addObjectsPartition($offset_field, $partition, &$objects, $data = NULL, $callback = NULL, $return_flag = true) @@ -267,6 +293,8 @@ class WebArchiveBundle /** * + * @param int $partition + * @return array */ function readPartitionInfoBlock($partition) { @@ 
-275,6 +303,8 @@ class WebArchiveBundle /** * + * @param int $partition + * @param array $data */ function writePartitionInfoBlock($partition, &$data) { @@ -283,6 +313,9 @@ class WebArchiveBundle /** * + * @param array $pages + * @param string $key_field + * @return mixed */ function differencePageKeysFilter($pages, $key_field) { @@ -302,6 +335,8 @@ class WebArchiveBundle /** * + * @param array &$page_array + * @param string $field_name */ function differencePagesFilter(&$page_array, $field_name = NULL) { @@ -321,6 +356,8 @@ class WebArchiveBundle /** * + * @param int $index + * @param bool $fast_construct */ function getPartition($index, $fast_construct = true) { @@ -343,6 +380,7 @@ class WebArchiveBundle /** * + * @param int $num */ function addCount($num) { @@ -354,6 +392,8 @@ class WebArchiveBundle /** * + * @param string $dir_name + * @return array */ static function getArchiveInfo($dir_name) { @@ -374,6 +414,9 @@ class WebArchiveBundle /** * + * @param string $value + * @param int $num_partitions + * @return int */ static function selectPartition($value, $num_partitions) { diff --git a/lib/web_queue_bundle.php b/lib/web_queue_bundle.php index f64f21f67..20eb54ef3 100755 --- a/lib/web_queue_bundle.php +++ b/lib/web_queue_bundle.php @@ -145,6 +145,10 @@ class WebQueueBundle implements Notifier /** * + * @param string $dir_name + * @param int $filter_size + * @param int $num_urls_ram + * @param string $min_or_max */ function __construct($dir_name, $filter_size, $num_urls_ram, $min_or_max) @@ -231,6 +235,7 @@ class WebQueueBundle implements Notifier /** * + * @param array &$url_pairs */ function addUrlsQueue(&$url_pairs) { @@ -273,6 +278,7 @@ class WebQueueBundle implements Notifier /** * + * @param string $url */ function containsUrlQueue(&$url) { @@ -283,6 +289,8 @@ class WebQueueBundle implements Notifier /** * + * @param string $url + * @param float $delta */ function adjustQueueWeight(&$url, $delta) { @@ -301,6 +309,7 @@ class WebQueueBundle implements 
Notifier /** * + * @param string $url */ function removeQueue($url) { @@ -323,6 +332,9 @@ class WebQueueBundle implements Notifier /** * + * @param int $i + * @param resource $fh + * @return mixed */ function peekQueue($i = 1, $fh = NULL) { @@ -365,6 +377,10 @@ class WebQueueBundle implements Notifier } } + /** + * + * @return array + */ function getContents() { $count = $this->to_crawl_queue->count; @@ -377,6 +393,7 @@ class WebQueueBundle implements Notifier /** * + * @param int $new_total */ function normalize($new_total = NUM_URLS_QUEUE_RAM) { @@ -387,6 +404,8 @@ class WebQueueBundle implements Notifier /** * + * @param string $mode + * @return resource */ function openUrlArchive($mode = "r") { @@ -395,6 +414,7 @@ class WebQueueBundle implements Notifier /** * + * @param resource $fh */ function closeUrlArchive($fh) { @@ -403,6 +423,7 @@ class WebQueueBundle implements Notifier /** * + * @param string $url */ function addSeenUrlFilter($url) { @@ -411,6 +432,8 @@ class WebQueueBundle implements Notifier /** * + * @param array &$url_array + * @param string $field_name */ function differenceSeenUrls(&$url_array, $field_name = NULL) { @@ -420,6 +443,7 @@ class WebQueueBundle implements Notifier /** * + * @param string $host */ function addGotRobotTxtFilter($host) { @@ -428,6 +452,8 @@ class WebQueueBundle implements Notifier /** * + * @param string $host + * @return bool */ function containsGotRobotTxt($host) { @@ -436,6 +462,7 @@ class WebQueueBundle implements Notifier /** * + * @param string $host */ function addDisallowedRobotFilter($host) { @@ -444,6 +471,8 @@ class WebQueueBundle implements Notifier /** * + * @param string $host_path + * @return bool */ function containsDisallowedRobot($host_path) { @@ -452,6 +481,7 @@ class WebQueueBundle implements Notifier /** * + * @return int */ function getRobotTxtAge() { @@ -464,6 +494,8 @@ class WebQueueBundle implements Notifier /** * + * @param string $host + * @param int $value */ function setCrawlDelay($host, 
$value) { @@ -480,6 +512,8 @@ class WebQueueBundle implements Notifier /** * + * @param string $host + * @return int */ function getCrawlDelay($host) { @@ -499,6 +533,9 @@ class WebQueueBundle implements Notifier /** * + * @param string $name + * @param int $num_values + * @return object */ function constructHashTable($name, $num_values) { @@ -509,6 +546,8 @@ class WebQueueBundle implements Notifier /** * + * @param string $key + * @return string */ function lookupHashTable($key) { @@ -517,6 +556,7 @@ class WebQueueBundle implements Notifier /** * + * @param string $value */ function deleteHashTable($value) { @@ -529,6 +569,9 @@ class WebQueueBundle implements Notifier /** * + * @param string $key + * @param string $value + * @return bool */ function insertHashTable($key, $value) { @@ -616,6 +659,7 @@ class WebQueueBundle implements Notifier } /** + * * */ function emptyRobotFilters() @@ -645,6 +689,8 @@ class WebQueueBundle implements Notifier /** * + * @param int $index + * @param array $data */ function notify($index, $data) {