diff --git a/bin/fetcher.php b/bin/fetcher.php
index e633f1690..bdae4bbb5 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -31,6 +31,7 @@
  * @filesource
  */
 
+/** Calculate base directory of script */
 define("BASE_DIR", substr($_SERVER['DOCUMENT_ROOT'].$_SERVER['PWD'].$_SERVER["SCRIPT_NAME"], 0, -strlen("bin/fetcher.php")));
diff --git a/bin/queue_server.php b/bin/queue_server.php
index 39a158124..9b76ad89d 100755
--- a/bin/queue_server.php
+++ b/bin/queue_server.php
@@ -31,6 +31,7 @@
  * @filesource
  */
 
+/** Calculate base directory of script */
 define("BASE_DIR", substr($_SERVER['DOCUMENT_ROOT'].
     $_SERVER['PWD'].$_SERVER["SCRIPT_NAME"], 0, -strlen("bin/queue_server.php")));
diff --git a/bot.php b/bot.php
index c0938914f..0398ed170 100755
--- a/bot.php
+++ b/bot.php
@@ -34,6 +34,7 @@
  * @filesource
  */
 
+/** Calculate base directory of script */
 define("BASE_DIR", substr($_SERVER['DOCUMENT_ROOT'].$_SERVER['PWD'].
     $_SERVER["SCRIPT_NAME"], 0, -strlen("bot.php")));
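
The BASE_DIR hunks above (and the matching ones in index.php, locale/extract_merge.php, and tests/index.php below) all document the same idiom: concatenate the server path variables and chop the script's install-relative path off the end. A minimal sketch of the idea; the sample path values are hypothetical stand-ins for the $_SERVER values:

<?php
// Sketch of the BASE_DIR idiom the added comments document; the sample
// paths below are hypothetical stand-ins for the real $_SERVER values.
$absolute_path = "/var/www/yioop/bin/fetcher.php"; // DOCUMENT_ROOT.PWD.SCRIPT_NAME
$script_suffix = "bin/fetcher.php";  // where this script sits in the install
// a negative substr length trims that many characters from the end
$base_dir = substr($absolute_path, 0, -strlen($script_suffix));
echo $base_dir; // outputs: /var/www/yioop/
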
diff --git a/configs/config.php b/configs/config.php
index 8a2a9c778..ec23c5acd 100755
--- a/configs/config.php
+++ b/configs/config.php
@@ -35,8 +35,11 @@
  * @filesource
  */
 
 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+/** bit of DEBUG_LEVEL used to indicate test cases should be displayable */
 define('TEST_INFO', 1);
+/** bit of DEBUG_LEVEL used to indicate query statistics should be displayed */
 define('QUERY_INFO', 2);
+/** bit of DEBUG_LEVEL used to indicate php messages should be displayed */
 define('ERROR_INFO', 4);
 
 date_default_timezone_set('America/Los_Angeles');
@@ -84,105 +87,125 @@ if((DEBUG_LEVEL & ERROR_INFO) == ERROR_INFO) {
     error_reporting(0);
 }
 
-if( (DEBUG_LEVEL & TEST_INFO) == TEST_INFO) {
-    define('DISPLAY_TESTS', true);
-} else {
-    define('DISPLAY_TESTS', false);
-}
+/** if true tests are displayable */
+define('DISPLAY_TESTS', ((DEBUG_LEVEL & TEST_INFO) == TEST_INFO));
 
-if( (DEBUG_LEVEL & QUERY_INFO) == QUERY_INFO) {
-    define('QUERY_STATISTICS', true);
-} else {
-    define('QUERY_STATISTICS', false);
-}
+/** if true query statistics are displayed */
+define('QUERY_STATISTICS', ((DEBUG_LEVEL & QUERY_INFO) == QUERY_INFO));
 
 if(!PROFILE) {
     return;
}
 
 /*+++ End machine generated code, feel free to edit the below as desired +++*/
 
+/** this is the User-Agent name the crawler provides
+ * a web-server it is crawling
+ */
 define('USER_AGENT',
    'Mozilla/5.0 (compatible; '.USER_AGENT_SHORT.' +'.QUEUE_SERVER.'bot.php)');
-    /* this is the User-Agent names the crawler provides
-       a web-server it is crawling
-     */
+
+/** name of the cookie used to manage the session
+ * (store language and perpage settings)
+ */
 define ('SESSION_NAME', "yioopbiscuit");
-    /* name of the cookie used to manage the session
-       (store language and perpage settings)
-     */
+/** maximum size of a log file before it is rotated */
 define("MAX_LOG_FILE_SIZE", 5000000);
-    // maximum size of a log file before it is rotated
+
+/** number of log files to rotate amongst */
 define("NUMBER_OF_LOG_FILES", 5);
-    // number of log files to rotate amongst
+/**
+ * how long in seconds to keep a cache of a robots.txt
+ * file before re-requesting it
+ */
 define('CACHE_ROBOT_TXT_TIME', 86400);
-    /* how long in seconds to keep a cache of a robot.txt
-       file before re-requesting it
-     */
+
+/**
+ * if the robots.txt has a Crawl-delay larger than this
+ * value don't crawl the site.
+ * maximum value for this is 255
+ */
 define('MAXIMUM_CRAWL_DELAY', 64);
-    /* if the robots.txt has a Crawl-delay larger than this
-       value don't crawl the site.
-       maximum value for this is 255
-     */
-define('MAX_WAITING_HOSTS', 1000);
-    //maximum number of active crawl-delayed hosts
+/** maximum number of active crawl-delayed hosts */
+define('MAX_WAITING_HOSTS', 1000);
 
+/**
+ * bloom filters are used to keep track of which urls are visited,
+ * this parameter determines up to how many
+ * urls will be stored in a single filter. Additional filters are
+ * read to and from disk.
+ */
 define('URL_FILTER_SIZE', 10000000);
-    /* bloom filters are used to keep track of which urls are visited,
-       this parameter determines up to how many
-       urls will be stored in a single filter. Additional filters are
-       read to and from disk.
-     */
+
+/** number of fetchers that will be used in a given crawl */
 define('NUM_FETCHERS', 3);
-    // number of fetchers that will be used in a given crawl
+
+/**
+ * maximum number of urls that will be held in ram
+ * (as opposed to in files) in the priority queue
+ */
 define('NUM_URLS_QUEUE_RAM', 300000);
-    /* maximum number of urls that will be held in ram
-       (as opposed to in files) in the priority queue
-     */
+/** Minimum weight in the priority queue before it is rebuilt */
 define('MIN_QUEUE_WEIGHT', 1/100000);
-define('NUM_ARCHIVE_PARTITIONS', 10);
-    // number of web archive files to use to store web pages in
 
+/** number of web archive files to use to store web pages in */
+define('NUM_ARCHIVE_PARTITIONS', 10);
+
+/**
+ * number of web archive files to use for the inverted index of
+ * word->docs in a given generation
+ */
 define('NUM_INDEX_PARTITIONS', 250);
-    /* number of web archive files to use for the inverted index of
-       word->docs in a given generation
-     */
-define('NUM_WORDS_PER_GENERATION', 6*URL_FILTER_SIZE/NUM_INDEX_PARTITIONS);
-    // number of words before next gen
-define('SAMPLE_GENERATIONS', 3);
-    // number of generations to sample in estimating number of urls in a query
+/** number of words before the next generation is started */
+define('NUM_WORDS_PER_GENERATION', 6*URL_FILTER_SIZE/NUM_INDEX_PARTITIONS);
+/** number of generations to sample in estimating number of urls in a query */
+define('SAMPLE_GENERATIONS', 3);
+/** store inlink data in word inverted index */
 define('STORE_INLINKS_IN_DICTIONARY', false);
-    //store inlink data in word inverted index
+
+/** precision to which floating point document scores are rounded */
 define('PRECISION', 10);
-    // precision to round floating points document scores
+
+/**
+ * when indexing data for relatively uncommon words,
+ * how many docs should be grouped together in a block
+ */
 define('BLOCK_SIZE', 50);
-    /* when index data from relatively uncommon words,
-       how many docs should be grouped together in a block
-     */
+
+/** how many documents a word needs to appear in to get its own index file */
 define('COMMON_WORD_THRESHOLD', 1000);
-    // how many documents a word needs to be to get its own index file.
+/** maximum number of links to consider on any given page */
 define('MAX_LINKS_PER_PAGE', 50);
-    // maximum number of links to consider on any given page
+
+/** maximum number of words from links to consider on any given page */
 define('MAX_LINKS_WORD_TEXT', 200);
-    // maximum number of words from links to consider on any given page
-define('PAGE_RANGE_REQUEST', 50000); // request this many bytes out of a page
-define('MAX_PHRASE_LEN', 2); //maximum length +1 exact phrase matches
+/** request this many bytes out of a page */
+define('PAGE_RANGE_REQUEST', 50000);
+
+/** maximum length + 1 of exact phrase matches */
+define('MAX_PHRASE_LEN', 2);
+
+/** number of multi curl page requests in one go */
 define('NUM_MULTI_CURL_PAGES', 100);
-    //number of multi curl page requests in one go
+
+/** time in seconds before we give up on a page */
 define('PAGE_TIMEOUT', 30);
-    //time in seconds before we give up on a page
+/** how often in OPIC we renormalize so the sum of weights totals MAX_URLS */
 define('NORMALIZE_FREQUENCY', 10000);
-    // how often should we make in OPIC the sum of weights totals MAX_URLS
-
+/**
+ * @global array file extensions which can be handled by the search engine,
+ * other extensions will be ignored
+ */
 $INDEXED_FILE_TYPES =
     array(  "html",
             "htm",
@@ -207,6 +230,10 @@ $INDEXED_FILE_TYPES =
             "gif",
             "png");
 
+/**
+ * @global array associates mimetypes that can be processed by the search
+ * engine with the processor class that can process them
+ */
 $PAGE_PROCESSORS = array(   "text/html" => "HtmlProcessor",
                             "text/asp" => "HtmlProcessor",
@@ -229,19 +256,21 @@ $PAGE_PROCESSORS = array(   "text/html" => "HtmlProcessor",
 
-
+/**
+ * How many non-robot urls the fetcher successfully downloads between
+ * times data is sent back to the queue server
+ */
 define ('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', 500);
-define ('MAX_FETCH_SIZE', 5000);
-    //maximum number of urls to schedule to a given fetcher in one go
+
+/** maximum number of urls to schedule to a given fetcher in one go */
+define ('MAX_FETCH_SIZE', 5000);
+
+/** fetcher must wait at least this long between multi-curl requests */
 define ('MINIMUM_FETCH_LOOP_TIME', 5);
-    //fetcher must wait at least this long between multi-curl requests
 
-/*
- * searching and admin
- */
+/** default number of search results to display per page */
 define ('NUM_RESULTS_PER_PAGE', 10);
-    //default number of search results to display per page
+/** Number of recently crawled urls to display on admin screen */
 define ('NUM_RECENT_URLS_TO_DISPLAY', 10);
-    // Number of recently crawled urls to display on admin screen
 ?>
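
TEST_INFO, QUERY_INFO, and ERROR_INFO above are bit flags that get OR'ed together into DEBUG_LEVEL, and the rewritten DISPLAY_TESTS/QUERY_STATISTICS defines just test the individual bits. A sketch of how the masks interact, assuming a hypothetical DEBUG_LEVEL that enables two of the three bits:

<?php
// The three flag bits from configs/config.php; DEBUG_LEVEL here is a
// hypothetical setting combining two of them.
define('TEST_INFO', 1);
define('QUERY_INFO', 2);
define('ERROR_INFO', 4);
define('DEBUG_LEVEL', TEST_INFO | ERROR_INFO);

// The same mask tests config.php now uses to derive its boolean constants
define('DISPLAY_TESTS', ((DEBUG_LEVEL & TEST_INFO) == TEST_INFO));
define('QUERY_STATISTICS', ((DEBUG_LEVEL & QUERY_INFO) == QUERY_INFO));

var_dump(DISPLAY_TESTS);    // bool(true)  -- the TEST_INFO bit is set
var_dump(QUERY_STATISTICS); // bool(false) -- the QUERY_INFO bit is not
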
diff --git a/controllers/search_controller.php b/controllers/search_controller.php
index cc2668737..c3e67715e 100755
--- a/controllers/search_controller.php
+++ b/controllers/search_controller.php
@@ -154,8 +154,14 @@ class SearchController extends Controller implements CrawlConstants
             }
         }
 
+        $token_okay = $this->checkCSRFToken('YIOOP_TOKEN', $user);
+        if($token_okay === false) {
+            unset($_SESSION['USER_ID']);
+            $user = $_SERVER['REMOTE_ADDR'];
+        }
         $data['YIOOP_TOKEN'] = $this->generateCSRFToken($user);
+
         $data['ELAPSED_TIME'] = changeInMicrotime($start_time);
 
         $this->displayView($view, $data);
     }
@@ -172,9 +178,9 @@ class SearchController extends Controller implements CrawlConstants
      * argument provides auxiliary information on how to conduct the
      * search. For instance on a related web page search, it might provide
      * the url of the site with which to perform the related search.
-     * @param int $results_per_page the maixmum number of search results
+     * @param int $results_per_page the maximum number of search results
      *      that can occur on a page
-     * @return array an array of at most results_per_page many search results
+     * @return array an array of at most results_per_page many search results
      */
     function processQuery($query, $activity, $arg, $results_per_page)
     {
@@ -236,7 +242,7 @@ class SearchController extends Controller implements CrawlConstants
      * This method parses the raw query string for query activities.
      * It parses the name of each activity and its argument
      *
-     * @return array list of search activities parsed out of the search string
+     * @return array list of search activities parsed out of the search string
      */
     function extractActivityQuery()
     {
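
The new checkCSRFToken/generateCSRFToken pairing above follows the usual pattern: hand the client a token tied to its identity, verify it on the next request, and fall back to treating the visitor as anonymous when verification fails. A minimal sketch of that pattern; the hashing scheme, separator, secret, and one-hour timeout here are assumptions for illustration, not Yioop's actual implementation:

<?php
// Hypothetical token scheme illustrating the check-then-regenerate flow
// in SearchController; Yioop's real functions differ in detail.
define('AUTH_KEY', 'some-server-secret'); // assumed server-side secret

function generateCSRFToken($user)
{
    $time = time();
    return md5($user.$time.AUTH_KEY)."*".$time;
}

function checkCSRFToken($token, $user)
{
    $parts = explode("*", $token);
    if(count($parts) != 2 || time() - $parts[1] > 3600) {
        return false; // malformed token, or more than an hour old
    }
    return $parts[0] == md5($user.$parts[1].AUTH_KEY);
}

$token = generateCSRFToken("127.0.0.1");
var_dump(checkCSRFToken($token, "127.0.0.1")); // bool(true)
var_dump(checkCSRFToken($token, "10.0.0.1"));  // bool(false), wrong user
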
diff --git a/index.php b/index.php
index 2bfa007b2..f2e6e3093 100755
--- a/index.php
+++ b/index.php
@@ -35,6 +35,7 @@
  * @filesource
  */
 
+/** Calculate base directory of script */
 define("BASE_DIR", substr($_SERVER['DOCUMENT_ROOT'].$_SERVER['PWD'].
     $_SERVER["SCRIPT_NAME"], 0, -strlen("index.php")));
diff --git a/lib/bloom_filter_bundle.php b/lib/bloom_filter_bundle.php
index 16d7cc5a7..6710bd8d9 100644
--- a/lib/bloom_filter_bundle.php
+++ b/lib/bloom_filter_bundle.php
@@ -47,12 +47,34 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 class BloomFilterBundle
 {
+    /**
+     * Reference to the filter which will be used to store new data
+     * @var object
+     */
     var $current_filter;
+    /**
+     * Total number of filters that this filter bundle currently has
+     * @var int
+     */
     var $num_filters;
+    /**
+     * The number of items which have been stored in the current filter
+     * @var int
+     */
     var $current_filter_count;
+    /**
+     * The maximum capacity of a filter in this filter bundle
+     * @var int
+     */
     var $filter_size;
+    /**
+     * The folder name of this filter bundle
+     * @var string
+     */
     var $dir_name;
-
+    /**
+     * The default maximum size of a filter in a filter bundle
+     */
     const default_filter_size = 10000000;
 
     /**
@@ -156,7 +178,8 @@ class BloomFilterBundle
     }
 
     /**
-     *
+     * Saves the meta data (number of filters, number of items stored, and size)
+     * of the bundle
      */
     public function saveMetaData()
     {
@@ -169,7 +192,7 @@ class BloomFilterBundle
     }
 
     /**
-     *
+     * Used to save to disk all the file data associated with this bundle
      */
     public function forceSave()
     {
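
Taken together, the BloomFilterBundle fields documented above (a current filter, a per-filter capacity, and a count of items in the current filter) describe a structure that rolls over to a fresh filter once the current one fills up. A hypothetical usage sketch; the constructor arguments and the add/contains method names are assumptions based on the documented fields, not taken from the diff:

<?php
// Hypothetical usage of BloomFilterBundle; the constructor and the
// add()/contains() signatures are assumptions for illustration.
require_once BASE_DIR."/lib/bloom_filter_bundle.php";

$bundle = new BloomFilterBundle("/tmp/url_filters", 10000000);
$url = "http://www.example.com/";
if(!$bundle->contains($url)) { // bloom filters never give false negatives
    $bundle->add($url);        // filling a filter triggers rollover to a new one
}
$bundle->forceSave();          // flush filter data and meta data to disk
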
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index 0899b9d06..738201f4f 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -33,6 +33,11 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 
+/**
+ * Reads in constants used as enums for storing web sites
+ */
+require_once BASE_DIR."/lib/crawl_constants.php";
+
 /**
  *
  * Code used to manage HTTP requests from one or more URLS
  *
  *
  * @package seek_quarry
  * @subpackage library
- */
-
-require_once BASE_DIR."/lib/crawl_constants.php";
-
+ */
 class FetchUrl implements CrawlConstants
 {
diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php
index 3c4d9142a..7cc49d41f 100755
--- a/lib/phrase_parser.php
+++ b/lib/phrase_parser.php
@@ -36,9 +36,13 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 
 /**
  * load the stem word function, if necessary
  */
-require_once BASE_DIR."/lib/porter_stemmer.php";
+require_once BASE_DIR."/lib/porter_stemmer.php";
+
+/**
+ * Reads in constants used as enums for storing web sites
+ */
+require_once BASE_DIR."/lib/crawl_constants.php";
+
 /**
  * library of functions used to manipulate words and phrases
  *
@@ -48,8 +52,6 @@ require_once BASE_DIR."/lib/porter_stemmer.php";
  * @package seek_quarry
  * @subpackage librarys
  */
-require_once BASE_DIR."/lib/crawl_constants.php";
-
 class PhraseParser
 {
     /**
diff --git a/lib/porter_stemmer.php b/lib/porter_stemmer.php
index 8e953dd36..74f6d3f99 100755
--- a/lib/porter_stemmer.php
+++ b/lib/porter_stemmer.php
@@ -71,8 +71,9 @@ class PorterStemmer
     }
 
     /**
-     * Checks to see if the ith character in the buffer is a consonant
+     * Checks to see if the ith character in the buffer is a consonant
      *
+     * @param int $i the character to check
      */
     private static function cons($i)
     {
@@ -97,8 +98,6 @@ class PorterStemmer
      *   <c>vcvcvc<v>   gives 3
      *   ....
      */
-
-
     private static function m()
     {
         $n = 0;
@@ -270,7 +269,6 @@ class PorterStemmer
     /* step2() maps double suffices to single ones. so -ization ( = -ize plus
        -ation) maps to -ize etc.Note that the string before the suffix must give
        m() > 0. */
-
     private static function step2()
     {
         if(self::$k < 1) return;
diff --git a/lib/string_array.php b/lib/string_array.php
index 4355753bf..7d61ec1c5 100755
--- a/lib/string_array.php
+++ b/lib/string_array.php
@@ -51,10 +51,25 @@ require_once "persistent_structure.php";
 class StringArray extends PersistentStructure
 {
+    /**
+     * Name of the file in which to store this string array
+     * @var string
+     */
     var $filename;
+    /**
+     * Number of items this string array can hold
+     * @var int
+     */
     var $num_values;
+    /**
+     * Size in bytes allocated for the array
+     * @var int
+     */
     var $array_size;
+    /**
+     * Size in bytes of a single item in the array
+     * @var int
+     */
     var $data_size;
+    /**
+     * The string used to hold the packed array data
+     * @var string
+     */
     var $string_array;
diff --git a/lib/unit_test.php b/lib/unit_test.php
index 8dcb2dea2..0270714ed 100644
--- a/lib/unit_test.php
+++ b/lib/unit_test.php
@@ -42,20 +42,31 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
  */
 abstract class UnitTest
 {
+
+    /**
+     * Used to store the results for each test sub case
+     */
     var $test_case_results;
+    /**
+     * Used to hold objects to be used in tests
+     * @var array
+     */
     var $test_objects;
-
+    /**
+     * The suffix that all TestCase methods need to have to be called by run()
+     */
     const case_name = "TestCase";
-    const case_name_len = 8;
 
     /**
-     *
+     * Constructor should be overridden to do any set up that occurs before
+     * any test cases
      */
     public function __construct()
     {
     }
 
     /**
-     *
+     * Execute each of the test cases of this unit test and return the results
+     * @return array test case results
      */
     public function run()
     {
@@ -68,7 +79,7 @@ abstract class UnitTest
             $len = strlen($method);
             if(substr_compare(
-                $method, self::case_name, $len - self::case_name_len) == 0) {
+                $method, self::case_name, $len - strlen(self::case_name)) == 0){
                 $this->test_case_results = array();
                 $this->$method();
                 $test_results[$method] = $this->test_case_results;
@@ -80,7 +91,11 @@ abstract class UnitTest
     }
 
     /**
-     *
+     * Checks that $x can be coerced to true, the result of the
+     * test is added to $this->test_case_results
+     *
+     * @param mixed $x item to check
+     * @param string $description information about this test subcase
      */
     public function assertTrue($x, $description = "")
     {
@@ -96,7 +111,11 @@ abstract class UnitTest
     }
 
     /**
-     *
+     * Checks that $x can be coerced to false, the result of the
+     * test is added to $this->test_case_results
+     *
+     * @param mixed $x item to check
+     * @param string $description information about this test subcase
      */
     public function assertFalse($x, $description = "")
     {
@@ -112,7 +131,12 @@ abstract class UnitTest
     }
 
     /**
+     * Checks that $x and $y are the same, the result of the
+     * test is added to $this->test_case_results
      *
+     * @param mixed $x a first item to compare
+     * @param mixed $y a second item to compare
+     * @param string $description information about this test subcase
      */
     public function assertEqual($x, $y, $description = "")
     {
@@ -128,7 +152,12 @@ abstract class UnitTest
     }
 
     /**
+     * Checks that $x and $y are not the same, the result of the
+     * test is added to $this->test_case_results
      *
+     * @param mixed $x a first item to compare
+     * @param mixed $y a second item to compare
+     * @param string $description information about this test subcase
      */
     public function assertNotEqual($x, $y, $description = "")
     {
@@ -144,12 +173,13 @@ abstract class UnitTest
     }
 
     /**
-     *
+     * This method is called before each test case is run to set up the
+     * given test case
      */
     abstract public function setUp();
 
     /**
-     *
+     * This method is called after each test case is run to clean up
      */
     abstract public function tearDown();
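
The rewritten run() above discovers test cases by suffix: any method whose name ends in the case_name constant "TestCase" is executed, with setUp() and tearDown() around each case per their new docblocks. A sketch of a subclass written against that contract; the stack being tested is made up for illustration:

<?php
// Sketch of a UnitTest subclass; run() finds pushPopTestCase() because
// its name ends in the "TestCase" suffix. The stack content is made up.
require_once BASE_DIR."/lib/unit_test.php";

class StackTest extends UnitTest
{
    public function setUp()
    {
        $this->test_objects['stack'] = array();
    }

    public function tearDown()
    {
        $this->test_objects = array();
    }

    public function pushPopTestCase()
    {
        array_push($this->test_objects['stack'], 42);
        $value = array_pop($this->test_objects['stack']);
        $this->assertEqual($value, 42, "pop returns the last pushed value");
        $this->assertFalse(count($this->test_objects['stack']),
            "stack is empty after the pop");
    }
}

$test = new StackTest();
print_r($test->run()); // per-method arrays of PASS/FAIL subcase results
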
diff --git a/lib/web_archive_bundle.php b/lib/web_archive_bundle.php
index aefbc891d..72c50d3d3 100755
--- a/lib/web_archive_bundle.php
+++ b/lib/web_archive_bundle.php
@@ -34,17 +34,26 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 
 /**
- * Load files we're dependent on if neccesary
+ * A WebArchiveBundle is a collection of WebArchives, so we need the
+ * definition of a web archive
  */
 require_once 'web_archive.php';
+/**
+ * Bloom filter files underlie the filter used to check if a page is
+ * already stored
+ */
 require_once 'bloom_filter_file.php';
+/**
+ * The page exists check is done with a bloom filter bundle
+ */
 require_once 'bloom_filter_bundle.php';
+/**
+ * Default compressor used on pages stored in the bundle
+ */
 require_once 'gzip_compressor.php';
 
 /**
- *
 * A web archive bundle is a collection of web archives which are managed
 * together.It is useful to split data across several archive files rather than
 * just store it in one, for both read efficiency and to keep filesizes from
@@ -59,13 +68,37 @@ require_once 'gzip_compressor.php';
 class WebArchiveBundle
 {
+    /**
+     * Folder name of this web archive bundle
+     * @var string
+     */
     var $dir_name;
+    /**
+     * Number of urls the page exists filter bundle can hold
+     * @var int
+     */
     var $filter_size;
+    /**
+     * The WebArchives this bundle is made of
+     * @var array
+     */
     var $partition = array();
+    /**
+     * Filter bundle used to check if a page is already in the bundle
+     * @var object
+     */
     var $page_exists_filter_bundle;
+    /**
+     * Number of WebArchives the bundle is split into
+     * @var int
+     */
     var $num_partitions;
+    /**
+     * Number of pages stored in this bundle
+     * @var int
+     */
     var $count;
+    /**
+     * A short text description of this bundle
+     * @var string
+     */
     var $description;
+    /**
+     * Compressor object used to compress pages stored in the bundle
+     * @var object
+     */
     var $compressor;
 
     /**
diff --git a/lib/web_queue_bundle.php b/lib/web_queue_bundle.php
index dbdec0bc0..13a5abad3 100755
--- a/lib/web_queue_bundle.php
+++ b/lib/web_queue_bundle.php
@@ -34,14 +34,33 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 
 /**
- * Load classes we're dependent on
+ * We use a variety of bloom filters for handling robots.txt data
 */
 require_once 'bloom_filter_file.php';
-require_once 'bloom_filter_bundle.php';
+/**
+ * Data on which urls we've already crawled is stored in a bloom filter bundle
+ */
+require_once 'bloom_filter_bundle.php';
+/**
+ * Priority queue is used to store the 8 byte ids of urls to crawl next
+ */
 require_once 'priority_queue.php';
+/**
+ * Hash table is used to store for each id in the priority queue an offset into
+ * a web archive for that url id's actual complete url
+ */
 require_once 'hash_table.php';
-require_once 'non_compressor.php';
+/**
+ * Urls are stored in a web archive using a compressor that does no compression
+ */
+require_once 'non_compressor.php';
+/**
+ * Used to store the to-crawl urls
+ */
 require_once 'web_archive.php';
+/**
+ * Used for the crawlHash function
+ */
 require_once 'utility.php';
 
 /**
@@ -54,20 +73,67 @@ require_once 'utility.php';
 class WebQueueBundle implements Notifier
 {
+    /**
+     * The folder name of this WebQueueBundle
+     * @var string
+     */
     var $dir_name;
+    /**
+     * Number of urls a bloom filter in this bundle can hold
+     * @var int
+     */
     var $filter_size;
+    /**
+     * Maximum number of urls to hold in ram in the priority queue
+     * @var int
+     */
     var $num_urls_ram;
+    /**
+     * Whether the priority queue crawls minimum or maximum weight urls first
+     * @var int
+     */
     var $min_or_max;
-
+    /**
+     * Priority queue of the ids of urls to crawl next
+     * @var object
+     */
     var $to_crawl_queue;
+    /**
+     * Hash table mapping url ids to offsets in the to-crawl web archive
+     * @var object
+     */
     var $to_crawl_table;
+    /**
+     * Number of hash table operations since the table was last rebuilt
+     * @var int
+     */
     var $hash_rebuild_count;
+    /**
+     * Maximum number of hash table operations before the table is rebuilt
+     * @var int
+     */
     var $max_hash_ops_before_rebuild;
+    /**
+     * Web archive in which the complete to-crawl urls are stored
+     * @var object
+     */
     var $to_crawl_archive;
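
The require comments added above spell out WebQueueBundle's moving parts: a priority queue of 8-byte url ids, a hash table from id to archive offset, and a web archive holding the full urls. A toy sketch of that lookup chain, with plain arrays standing in for the real PriorityQueue, HashTable, and WebArchive objects:

<?php
// Toy model of the id -> offset -> url chain described in the
// web_queue_bundle.php comments; real code uses PriorityQueue,
// HashTable, and WebArchive objects rather than arrays.
$to_crawl_queue = array("a1b2c3d4");          // 8-byte url ids by priority
$to_crawl_table = array("a1b2c3d4" => 1024);  // id -> offset in url archive
$to_crawl_archive = array(1024 => "http://www.example.com/");

$id = array_shift($to_crawl_queue);  // pull the highest priority id
$offset = $to_crawl_table[$id];      // find where its full url is stored
$url = $to_crawl_archive[$offset];   // recover the complete url to fetch
echo $url;
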
     var $url_exists_filter_bundle;
+    /**
+     * Filter recording which hosts we have gotten robots.txt files from
+     * @var object
+     */
     var $got_robottxt_filter;
+    /**
+     * Filter recording paths disallowed by robots.txt files
+     * @var object
+     */
     var $dissallowed_robot_filter;
+    /**
+     * Filter recording hosts whose robots.txt specifies a crawl delay
+     * @var object
+     */
     var $crawl_delay_filter;
 
     const max_url_archive_offset = 1000000000;
diff --git a/locale/extract_merge.php b/locale/extract_merge.php
index 44bffb0ac..91f55b574 100755
--- a/locale/extract_merge.php
+++ b/locale/extract_merge.php
@@ -44,6 +44,7 @@ if(isset($_SERVER['DOCUMENT_ROOT']) && strlen($_SERVER['DOCUMENT_ROOT']) > 0) {
     exit();
 }
 
+/** Calculate base directory of script */
 define("BASE_DIR", substr($_SERVER['DOCUMENT_ROOT'].$_SERVER['PWD'].
     $_SERVER["SCRIPT_NAME"], 0, -strlen("locale/extract_merge.php")));
 
@@ -78,13 +79,13 @@ updateLocales($general_ini, $strings);
 
 /**
  * Cycles through locale subdirectories in LOCALE_DIR, for each
- * locale it merges out the current gneral_ini and strings data.
+ * locale it merges out the current general_ini and strings data.
  * It deletes identifiers that are not in strings, it adds new identifiers
  * and it leaves existing identifier translation pairs untouched.
  *
  * @param array $general_ini data that would typically come from the
 *      general.ini file
- * @param array $string lines from what is equivalent to an ini file of
+ * @param array $strings lines from what is equivalent to an ini file of
  *      msg_id msg_string pairs these lines also have comments on the file
  *      that strings were extracted from
 *
diff --git a/models/crawl_model.php b/models/crawl_model.php
index 6922de4b6..1dcdff303 100755
--- a/models/crawl_model.php
+++ b/models/crawl_model.php
@@ -60,7 +60,7 @@ class CrawlModel extends Model implements CrawlConstants
     /**
      * Stores the name of the current index archive to use to get search
      * results from
-     * @Var string
+     * @var string
      */
     var $index_name;
diff --git a/tests/index.php b/tests/index.php
index 1ea3c119b..f6546c77f 100644
--- a/tests/index.php
+++ b/tests/index.php
@@ -34,6 +34,7 @@
  * @filesource
  */
 
+/** Calculate base directory of script */
 define("BASE_DIR", substr($_SERVER['DOCUMENT_ROOT'].$_SERVER['PWD'].
     $_SERVER["SCRIPT_NAME"], 0, -strlen("tests/index.php")));
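
The corrected updateLocales() docblock in extract_merge.php states the merge rule precisely: identifiers missing from the extracted strings are deleted, new identifiers are added, and existing identifier-translation pairs are left untouched. A toy sketch of that rule; the array shapes are assumptions for illustration:

<?php
// Toy version of the merge rule from extract_merge.php's updateLocales()
// docblock; the array shapes here are assumptions, not Yioop's actual ones.
function mergeStringIds($existing, $extracted_ids)
{
    $merged = array();
    foreach($extracted_ids as $msg_id) {
        // keep an existing translation, start new identifiers empty
        $merged[$msg_id] = isset($existing[$msg_id]) ?
            $existing[$msg_id] : "";
    }
    return $merged; // identifiers absent from $extracted_ids are dropped
}

$existing = array("settings_title" => "Einstellungen", "obsolete_id" => "Alt");
$extracted_ids = array("settings_title", "new_id");
print_r(mergeStringIds($existing, $extracted_ids));
// keeps "Einstellungen", drops "obsolete_id", adds "new_id" => ""
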