diff --git a/bin/fetcher.php b/bin/fetcher.php index b72cfbdcf..59a209e65 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -155,6 +155,12 @@ class Fetcher implements CrawlConstants * @var array */ var $found_sites; + /** + * Urls of duplicate sites that the fetcher hasn't sent to + * the queue_server yet + * @var array + */ + var $found_duplicates; /** * Timestamp from the queue_server of the current schedule of sites to * download. This is sent back to the server once this schedule is completed @@ -217,6 +223,7 @@ class Fetcher implements CrawlConstants $this->to_crawl = array(); $this->to_crawl_again = array(); $this->found_sites = array(); + $this->found_duplicates = array(); $this->sum_seen_title_length = 0; $this->sum_seen_description_length = 0; @@ -320,8 +327,11 @@ class Fetcher implements CrawlConstants $site_pages = FetchUrl::getPages($sites, true); - list($deduplicated_pages, $schedule_again_pages) = + list($deduplicated_pages, $schedule_again_pages, $duplicates) = $this->deduplicateAndReschedulePages($site_pages); + + $this->found_duplicates = array_merge($this->found_duplicates, + $duplicates); if($can_schedule_again == true) { foreach($schedule_again_pages as $schedule_again_page) { if($schedule_again_page[self::CRAWL_DELAY] == 0) { @@ -535,10 +545,12 @@ class Fetcher implements CrawlConstants * Does page deduplication on an array of downloaded pages using a * BloomFilterBundle of $this->web_archive. Deduplication based * on summaries is also done on the queue server. Also, sorts out pages - * for which no content was downloaded so that they cna be scheduled + * for which no content was downloaded so that they can be scheduled * to be crawled again. * * @param array &$site_pages pages to deduplicate + * @return an array conisting of the deduclicated pages, the not_downloaded + * sites, and the urls of duplicate pages. 
*/ function deduplicateAndReschedulePages(&$site_pages) { @@ -546,7 +558,8 @@ class Fetcher implements CrawlConstants $deduplicated_pages = array(); $not_downloaded = array(); - + $duplicates = array(); + $unseen_page_hashes = $this->web_archive->differencePageKeysFilter($site_pages, self::HASH); @@ -560,13 +573,15 @@ class Fetcher implements CrawlConstants $deduplicated_pages[] = $site; } else if(!isset($site[self::HASH])){ $not_downloaded[] = $site; + } else { + $duplicates[] = $site[self::URL]; } } crawlLog(" Delete duplicated pages time". (changeInMicrotime($start_time))); - return array($deduplicated_pages, $not_downloaded); + return array($deduplicated_pages, $not_downloaded, $duplicates); } /** @@ -946,8 +961,11 @@ class Fetcher implements CrawlConstants $average_total_link_text_length = $doc_statistics[self::AVERAGE_TOTAL_LINK_TEXT_LENGTH]; - foreach($doc_statistics as $doc_key => $info) { + $special_case_fields = array(self::INLINKS, self::SITE_INFO, + self::FILETYPE, self::URL_INFO); + foreach($doc_statistics as $doc_key => $info) { + if(in_array($doc_key, $special_case_fields)) {continue;} $title_length = $info[self::TITLE_LENGTH]; $description_length = $info[self::DESCRIPTION_LENGTH]; $link_length = $info[self::LINK_LENGTH]; @@ -1034,7 +1052,7 @@ class Fetcher implements CrawlConstants number_format($doc_rank, PRECISION); //proxy for page rank $orphan = (isset($info[self::LINK_WORDS]) && - count($info[self::LINK_WORDS]) > 0 ) ? 1 : .5; + $info[self::LINK_WORDS] == true) ? 
1 : .5; $words[$word_key][$doc_key][self::SCORE] = number_format( .8*($doc_rank) @@ -1044,10 +1062,12 @@ class Fetcher implements CrawlConstants } } - - if(STORE_INLINKS_IN_DICTIONARY && - isset($doc_statistics[self::INLINKS])) { - foreach($doc_statistics[self::INLINKS] + + + //add word_keys for inlink, sites, filetype + foreach($special_case_fields as $special_case_field) { + if(isset($doc_statistics[$special_case_field])) { + foreach($doc_statistics[$special_case_field] as $url_word_key => $docs_info) { foreach($docs_info as $doc_key) { $doc_depth = $doc_statistics[$doc_key][self::DOC_DEPTH] + 1; @@ -1060,8 +1080,20 @@ class Fetcher implements CrawlConstants $words[$url_word_key][$doc_key][self::SCORE] = number_format(11 - $doc_depth, PRECISION); } + } + } + } + foreach($this->found_duplicates as $duplicate) { + $doc_key = crawlHash($duplicate); + $url_word_key = crawlHash("info:".$duplicate); + $words[$url_word_key][$doc_key][self::TITLE_WORD_SCORE] = -1; + $words[$url_word_key][$doc_key][self::DESCRIPTION_WORD_SCORE] = -1; + $words[$url_word_key][$doc_key][self::LINK_WORD_SCORE] = -1; + $words[$url_word_key][$doc_key][self::DOC_RANK] = -1; + $words[$url_word_key][$doc_key][self::SCORE] = -1; } + $this->found_duplicates = array(); $this->found_sites[self::INVERTED_INDEX] = $words; @@ -1080,13 +1112,16 @@ class Fetcher implements CrawlConstants function computeDocumentStatistics() { $doc_statistics = array(); - $this->num_seen_sites += count($this->found_sites[self::SEEN_URLS]); - foreach($this->found_sites[self::SEEN_URLS] as $site) { + $num_seen = count($this->found_sites[self::SEEN_URLS]); + $this->num_seen_sites += $num_seen; + for($i = 0; $i < $num_seen; $i++) { + $site = $this->found_sites[self::SEEN_URLS][$i]; $doc_key = crawlHash($site[self::URL]); $doc_statistics[$doc_key][self::URL_WEIGHT] = 3 - log(strlen($site[self::URL])); //negative except short urls - + $doc_statistics[$doc_key][self::DOC_DEPTH] = + log($site[self::INDEX]*NUM_FETCHERS, 10); 
$title_phrase_string = mb_ereg_replace("[[:punct:]]", " ", $site[self::TITLE]); $doc_statistics[$doc_key][self::TITLE_WORDS] = @@ -1107,28 +1142,90 @@ class Fetcher implements CrawlConstants $doc_statistics[$doc_key][self::DESCRIPTION_WORDS]); $this->sum_seen_site_description_length += $doc_statistics[$doc_key][self::DESCRIPTION_LENGTH]; - + $doc_statistics[$doc_key][self::LINK_WORDS] = array(); + $doc_statistics[$doc_key][self::LINK_LENGTH] = 0; + // store the sites the doc_key belongs to, so you can search by site + $url_sites = UrlParser::getHostPaths($site[self::URL]); + $url_sites = array_merge($url_sites, + UrlParser::getHostSubdomains($site[self::URL])); + foreach($url_sites as $url_site) { + if(strlen($url_site) > 0) { + $doc_statistics[self::SITE_INFO][ + crawlHash('site:'.$url_site)][] = $doc_key; + } + } + $doc_statistics[self::URL_INFO][ + crawlHash('info:'.$site[self::URL])][] = $doc_key; + + // store the filetype info + $url_type = UrlParser::getDocumentType($site[self::URL]); + if(strlen($url_type) > 0) { + $doc_statistics[self::FILETYPE][ + crawlHash('filetype:'.$url_type)][] = $doc_key; + } + $link_phrase_string = ""; $link_urls = array(); + //store inlinks so they can be searched by + $num_links = count($site[self::LINKS]); + if($num_links > 0) { + $link_weight = $site[self::WEIGHT]/$num_links; + } else { + $link_weight = 0; + } + $had_links = false; + foreach($site[self::LINKS] as $url => $link_text) { - $link_phrase_string .= " $link_text"; - if(STORE_INLINKS_IN_DICTIONARY) { - $doc_statistics[self::INLINKS][crawlHash($url)][] =$doc_key; + if(strlen($url) > 0) { + $summary = array(); + $had_links = true; + $link_text = strip_tags($link_text); + $link_id = + "url|".$url."|text|$link_text|ref|".$site[self::URL]; + $link_key = crawlHash($link_id).":".crawlHash($url).":" + .crawlHash("info:".$url); + $summary[self::URL] = $link_id; + $summary[self::TITLE] = $url; + // stripping html to be on the safe side + $summary[self::DESCRIPTION] = $link_text; 
+ $summary[self::TIMESTAMP] = $site[self::TIMESTAMP]; + $summary[self::ENCODING] = $site[self::ENCODING]; + $summary[self::HASH] = crawlHash($link_id); + $summary[self::TYPE] = "link"; + $summary[self::HTTP_CODE] = "link"; + $summary[self::WEIGHT] = $link_weight; + $this->found_sites[self::SEEN_URLS][] = $summary; + + $doc_statistics[$link_key][self::URL_WEIGHT] = + 3 - log(strlen($url)); + //negative except short urls + $doc_statistics[$link_key][self::TITLE_WORDS] =array(); + $doc_statistics[$link_key][self::TITLE_LENGTH] = 0; + $doc_statistics[$link_key][self::DESCRIPTION_WORDS] = + array(); + $doc_statistics[$link_key][self::DESCRIPTION_LENGTH] = 0; + + $link_text = + mb_ereg_replace("[[:punct:]]", " ", $link_text); + $doc_statistics[$link_key][self::LINK_WORDS] = + PhraseParser::extractPhrasesAndCount($link_text); + $doc_statistics[$link_key][self::LINK_LENGTH] = + $this->sumCountArray( + $doc_statistics[$link_key][self::LINK_WORDS]); + $this->sum_seen_site_link_length += + $doc_statistics[$link_key][self::LINK_LENGTH]; + + $doc_statistics[$link_key][self::DOC_DEPTH] = + log(10*$site[self::INDEX]*NUM_FETCHERS, 10); + //our proxy for page rank, 10=average links/page + $doc_statistics[self::INLINKS][crawlHash('link:'.$url)][] = + $doc_key; } + $this->found_sites[self::SEEN_URLS][$i][self::LINKS] = + $had_links; } - $link_phrase_string = - mb_ereg_replace("[[:punct:]]", " ", $link_phrase_string); - $doc_statistics[$doc_key][self::LINK_WORDS] = - PhraseParser::extractPhrasesAndCount($link_phrase_string); - $doc_statistics[$doc_key][self::LINK_LENGTH] = - $this->sumCountArray( - $doc_statistics[$doc_key][self::LINK_WORDS]); - $this->sum_seen_site_link_length += - $doc_statistics[$doc_key][self::LINK_LENGTH]; - $doc_statistics[$doc_key][self::DOC_DEPTH] = - log($site[self::INDEX]*NUM_FETCHERS, 10); - //our proxy for page rank, 10=average links/page + } $doc_statistics[self::AVERAGE_TITLE_LENGTH] = diff --git a/bin/queue_server.php b/bin/queue_server.php index 
44d897579..b47b06dc4 100755 --- a/bin/queue_server.php +++ b/bin/queue_server.php @@ -503,19 +503,30 @@ class QueueServer implements CrawlConstants $num_seen = 0; } - + $visited_urls_count = 0; for($i = 0; $i < $num_seen; $i++) { $index_archive->addPageFilter(self::HASH, $seen_sites[$i]); $seen_sites[$i][self::MACHINE] = $machine; $seen_sites[$i][self::MACHINE_URI] = $machine_uri; $seen_sites[$i][self::HASH_URL] = crawlHash($seen_sites[$i][self::URL]); + $link_url_parts = explode("|", $seen_sites[$i][self::URL]); + if(strcmp("url", $link_url_parts[0]) == 0 && + strcmp("text", $link_url_parts[2]) == 0) { + $seen_sites[$i][self::HASH_URL] = + crawlHash($seen_sites[$i][self::URL]). + ":".crawlHash($link_url_parts[1]). + ":".crawlHash("info:".$link_url_parts[1]); + } else { + $visited_urls_count++; + } } if(isset($seen_sites)) { $seen_sites = $index_archive->addPages( - self::HASH_URL, self::SUMMARY_OFFSET, $seen_sites); + self::HASH_URL, self::SUMMARY_OFFSET, $seen_sites, + $visited_urls_count); $summary_offsets = array(); foreach($seen_sites as $site) { @@ -727,13 +738,15 @@ class QueueServer implements CrawlConstants "Removing $url from Queue (shouldn't still be there!)"); $this->web_queue->removeQueue($url); } - - array_push($most_recent_urls, $url); - if($cnt >= NUM_RECENT_URLS_TO_DISPLAY) - { - array_shift($most_recent_urls); + if(strpos($url, "url|") !== 0) { + array_push($most_recent_urls, $url); + if($cnt >= NUM_RECENT_URLS_TO_DISPLAY) + { + array_shift($most_recent_urls); + } + $cnt++; } - $cnt++; + } } @@ -809,6 +822,7 @@ class QueueServer implements CrawlConstants $info_bundle = IndexArchiveBundle::getArchiveInfo( CRAWL_DIR.'/cache/'.self::index_data_base_name.$this->crawl_time); $crawl_status['COUNT'] = $info_bundle['COUNT']; + $crawl_status['VISITED_URLS_COUNT'] = $info_bundle['VISITED_URLS_COUNT']; $crawl_status['DESCRIPTION'] = $info_bundle['DESCRIPTION']; file_put_contents( CRAWL_DIR."/schedules/crawl_status.txt", serialize($crawl_status)); @@ -818,7 
+832,9 @@ class QueueServer implements CrawlConstants crawlLog( "The current crawl description is: ".$info_bundle['DESCRIPTION']); - crawlLog("Total seen urls so far: ".$info_bundle['COUNT']); + crawlLog("Number of unique pages so far: ". + $info_bundle['VISITED_URLS_COUNT']); + crawlLog("Total urls extracted so far: ".$info_bundle['COUNT']); crawlLog("Of these, the most recent urls are:"); foreach($most_recent_urls as $url) { crawlLog("URL: $url"); diff --git a/configs/config.php b/configs/config.php index 02fc6f00b..6337fe76c 100755 --- a/configs/config.php +++ b/configs/config.php @@ -138,7 +138,7 @@ define('MAX_WAITING_HOSTS', 1000); define('URL_FILTER_SIZE', 10000000); /** number of fetchers that will be used in a given crawl */ -define('NUM_FETCHERS', 3); +define('NUM_FETCHERS', 4); /** * maximum number of urls that will be held in ram diff --git a/controllers/search_controller.php b/controllers/search_controller.php index 2003b37e6..ceddfb63d 100755 --- a/controllers/search_controller.php +++ b/controllers/search_controller.php @@ -138,7 +138,8 @@ class SearchController extends Controller implements CrawlConstants } else { $index_time_stamp = 0; //use the default crawl index } - if(isset($_REQUEST['q']) || $activity != "query") { + if(isset($_REQUEST['q']) && strlen($_REQUEST['q']) >0 + || $activity != "query") { if($activity == "query") { $activity_array = $this->extractActivityQuery(); $query = $activity_array[0]; // dirty @@ -224,7 +225,7 @@ class SearchController extends Controller implements CrawlConstants crawlHash($url), $summary_offset); $top_phrases = - $this->phraseModel->getTopPhrases($crawl_item, 20); + $this->phraseModel->getTopPhrases($crawl_item, 3); $top_query = implode(" ", $top_phrases); $phrase_results = $this->phraseModel->getPhrasePageResults( $top_query, $limit, $results_per_page, false); @@ -244,8 +245,10 @@ class SearchController extends Controller implements CrawlConstants break; } - $data['PAGES'] = $phrase_results['PAGES']; - 
$data['TOTAL_ROWS'] = $phrase_results['TOTAL_ROWS']; + $data['PAGES'] = (isset($phrase_results['PAGES'])) ? + $phrase_results['PAGES']: array(); + $data['TOTAL_ROWS'] = (isset($phrase_results['TOTAL_ROWS'])) ? + $phrase_results['TOTAL_ROWS'] : 0; $data['LIMIT'] = $limit; $data['RESULTS_PER_PAGE'] = $results_per_page; @@ -332,11 +335,20 @@ class SearchController extends Controller implements CrawlConstants $machine_uri, $page, $offset, $crawl_time); $cache_file = $cache_item[self::PAGE]; + $request = $cache_item['REQUEST']; + $meta_words = array('link\:', 'site\:', + 'filetype\:', 'info\:', '\-', + 'index:', 'i:', 'weight:', 'w:'); + foreach($meta_words as $meta_word) { + $pattern = "/(\s)($meta_word(\S)+)/"; + $query = preg_replace($pattern, "", $query); + } $query = str_replace("'", " ", $query); $query = str_replace('"', " ", $query); $query = str_replace('\\', " ", $query); + $query = str_replace('|', " ", $query); $query = $this->clean($query, "string"); $page_url = $url; diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php index da45cb25a..0a2dcf79e 100644 --- a/lib/crawl_constants.php +++ b/lib/crawl_constants.php @@ -124,5 +124,9 @@ interface CrawlConstants const PAGE_IMPORTANCE = 'ad'; const MACHINE_URI = 'ae'; + const SITE_INFO = 'af'; + const FILETYPE = 'ag'; + const SUMMARY = 'ah'; + const URL_INFO = 'ai'; } ?> diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php index aeefd4967..f048e4128 100644 --- a/lib/index_archive_bundle.php +++ b/lib/index_archive_bundle.php @@ -54,28 +54,10 @@ require_once 'utility.php'; */ require_once 'crawl_constants.php'; -/** - * Enumerative interface for common constants between WordIterator and - * IndexArchiveBundle - * - * These constants are used as fields in arrays. They are negative to - * distinguish them from normal array elements 0, 1, 2... 
However, this - * means you need to be slightly careful if you try to sort the array - * as this might screw things up - * - * @author Chris Pollett - * @package seek_quarry - * @subpackage library +/** + *Loads common constants for word indexing */ -interface IndexingConstants -{ - const COUNT = -1; - const END_BLOCK = -2; - const LIST_OFFSET = -3; - const POINT_BLOCK = -4; - const PARTIAL_COUNT = -5; - const NAME = -6; -} +require_once 'indexing_constants.php'; /** @@ -116,427 +98,7 @@ function setOffsetPointers($data, &$objects, $offset_field) return $data; } -/** - * Used to iterate through the documents associated with a word in - * an IndexArchiveBundle. It also makes it easy to get the summaries - * of these documents and restrict the documents by additional words. - * - * A description of how words and the documents containing them are stored - * is given in the documentation of IndexArchiveBundle. To iterate over - * all documents containng a word, its hash, work_key, is formed. Then using - * the Bloom filter for that partition, it is determined if the word is stored - * at all, and if it is, which generations it occurs in. Then the iterator - * is set to point to the first block of the first generation the word appears - * in that is greater than the limit of the WordIterator. Thereafter, - * nextDocsWithWord will advance $this->current_pointer by one per call. - * $this->current_pointer keeps track of which block of documents containing - * the word to return. If it is less than COMMON_WORD_THRESHOLD/BLOCK_SIZE and - * there are still more blocks, then the corresponding block_pointer of the word - * from the generation's partition info_block is used to look up the offset to - * the doc block. If it is greater than this value then the linked list - * of doc blocks pointed to for the partition is followed to get the appropriate - * block. 
This list is in the order that words were stored in the index so - * LIST_OFFSET points to the last block stored, which in turn points to the - * next to last block, etc. Finally, when all the blocks in the linked-list are - * exhausted, the remaining docs for that generation for that word are stored - * in the info block for the word itself (this will always be less than - * BLOCK_SIZE many). Once all the docs for a word for a generation have been - * iterated through, than iteration proceeds to the next generation containing - * the word. - * - * @author Chris Pollett - * @package seek_quarry - * @subpackage library - * @see IndexArchiveBundle - */ -class WordIterator implements IndexingConstants, CrawlConstants -{ - /** - * hash of word that the iterator iterates over - * @var string - */ - var $word_key; - /** - * The IndexArchiveBundle this index is associated with - * @var object - */ - var $index; - /** - * The number of documents already iterated over - * @var int - */ - var $seen_docs; - /** - * @var int - */ - var $restricted_seen_docs; - /** - * The number of documents in the current block before filtering - * by restricted words - * @var int - */ - var $count_block_unfiltered; - /** - * Estimate of the number of documents that this iterator can return - * @var int - */ - var $num_docs; - - /** - * If iterating through the linked-list portions of the documents - * the next byte offset in the WebArchive based linked-list - * @var int - */ - var $next_offset; - /** - * Block number of the last block of docs - * @var int - */ - var $last_pointed_block; - /** - * @var int - */ - var $list_offset; - - /** - * Pointers to offsets for blocks containing docs with the given word - * for the current generation - * @var array - */ - var $block_pointers; - /** - * Number of completely full blocks of documents for the current generation - * @var int - */ - var $num_full_blocks; - /** - * Number of generations word appears in - * @var int - */ - var $num_generations; 
- /** - * Used to store the contents of the last partially full block - * @var int - */ - var $last_block; - /** - * - * @var object - */ - var $info_block; - /** - * Stores the number of the current block of documents we are at in the - * set of all blocks of BLOCK_SIZE many documents - * @var int - */ - var $current_pointer; - /** - * First document that should be returned - * amongst all of the documents associated with the - * iterator's $word_key - * @var int - */ - var $limit; - - /** - * Creates a word iterator with the given parameters. - * - * @param string $word_key hash of word or phrase to iterate docs of - * @param object $index the IndexArchiveBundle to use - * @param int $limit the first element to return from the list of docs - * iterated over - * @param object $info_block the info block of the WebArchive - * associated with the word in the index. If NULL, then this will - * loaded in WordIterator::reset() - */ - public function __construct($word_key, $index, $limit = 0, $info_block = NULL) - { - $this->word_key = $word_key; - $this->index = $index; - $this->limit = $limit; - $this->reset($info_block); - } - - /** - * Returns the iterators to the first document block that it could iterate - * over - * - * @param object $info_block the header block in the index WebArchiveBundle - * for the word this iterator iterates over. If not NULL, this saves - * the time to load it. If not it will be loaded, but this will be - * slower. 
- */ - public function reset($info_block = NULL) - { - $this->restricted_seen_docs = 0; - $this->count_block_unfiltered = 0; - - $partition = - WebArchiveBundle::selectPartition($this->word_key, - $this->index->num_partitions_index); - - if($info_block == NULL) { - $this->info_block = - $this->index->getPhraseIndexInfo($this->word_key); - } else { - $this->info_block = $info_block; - } - if($this->info_block !== NULL) { - $this->num_generations = count($this->info_block['GENERATIONS']); - $count_till_generation = $this->info_block[self::COUNT]; - - while($this->limit >= $count_till_generation) { - $this->info_block['CURRENT_GENERATION_INDEX']++; - if($this->num_generations <= - $this->info_block['CURRENT_GENERATION_INDEX']) { - $this->num_docs = 0; - $this->current_pointer = -1; - return; - } - $info_block = $this->index->getPhraseIndexInfo( - $this->word_key, - $this->info_block['CURRENT_GENERATION_INDEX'], - $this->info_block); - if($info_block !== NULL) { - $this->info_block = $info_block; - } - $count_till_generation += $this->info_block[self::COUNT]; - } - - - } - - $this->seen_docs = $count_till_generation - - $this->info_block[self::COUNT]; - $this->initGeneration(); - - - } - - /** - * Sets up the iterator to iterate through the current generation. 
- * - * @return bool whether the initialization succeeds - */ - public function initGeneration() - { - - if($this->info_block !== NULL) { - $info_block = $this->index->getPhraseIndexInfo( - $this->word_key, $this->info_block['CURRENT_GENERATION_INDEX'], - $this->info_block); - if($info_block === NULL) { - return false; - } - $this->info_block = $info_block; - $this->num_docs = $info_block['TOTAL_COUNT']; - $this->num_docs_generation = $info_block[self::COUNT]; - - $this->current_pointer = - max(floor(($this->limit - $this->seen_docs) / BLOCK_SIZE), 0); - $this->seen_docs += $this->current_pointer*BLOCK_SIZE; - $this->last_block = $info_block[self::END_BLOCK]; - $this->num_full_blocks = - floor($this->num_docs_generation / BLOCK_SIZE); - if($this->num_docs_generation > COMMON_WORD_THRESHOLD) { - $this->last_pointed_block = - floor(COMMON_WORD_THRESHOLD / BLOCK_SIZE); - } else { - $this->last_pointed_block = $this->num_full_blocks; - } - - for($i = 0; $i < $this->last_pointed_block; $i++) { - if(isset($info_block[$i])) { - $this->block_pointers[$i] = $info_block[$i]; - } - } - - if($this->num_docs_generation > COMMON_WORD_THRESHOLD) { - if($info_block[self::LIST_OFFSET] === NULL) { - $this->list_offset = NULL; - } else { - $this->list_offset = $info_block[self::LIST_OFFSET]; - } - } - - } else { - $this->num_docs = 0; - $this->num_docs_generation = 0; - $this->current_pointer = -1; - } - return true; - } - /** - * Gets the block of doc summaries associated with the current doc - * pointer and which match the array of additional word restrictions - * @param array $restrict_phrases an array of additional words or phrases - * to see if contained in summary - * @return array doc summaries that match - */ - public function currentDocsWithWord($restrict_phrases = NULL) - { - if($this->num_generations <= - $this->info_block['CURRENT_GENERATION_INDEX']) { - return -1; - } - $generation = - $this->info_block['GENERATIONS'][ - $this->info_block['CURRENT_GENERATION_INDEX']]; - 
if($this->current_pointer >= 0) { - if($this->current_pointer == $this->num_full_blocks) { - $pages = $this->last_block; - } else if ($this->current_pointer >= $this->last_pointed_block) { - /* if there are more than COMMON_WORD_THRESHOLD many - results and we're not at the last block yet - */ - if($this->list_offset === NULL) { - return -1; - } - $offset = $this->list_offset; - $found = false; - do { - /* the link list is actually backwards to the order we want - For now, we cycle along the list from the last data - stored until we find the block we want. This is slow - but we are relying on the fact that each generation is - not too big. - */ - $doc_block = $this->index->getWordDocBlock($this->word_key, - $offset, $generation); - $word_keys = array_keys($doc_block); - $found_key = NULL; - foreach($word_keys as $word_key) { - if(strstr($word_key, $this->word_key.":")) { - $found_key = $word_key; - if(isset($doc_block[ - $found_key][self::LIST_OFFSET])) { - //only one list offset/docblock - break; - } - } - } - if($found_key === NULL) { - break; - } - if(isset($doc_block[ - $this->word_key.":".$this->current_pointer])) { - $found = true; - break; - } - $offset = $doc_block[$found_key][self::LIST_OFFSET]; - } while($offset != NULL); - if($found != true) { - $pages = array(); - } else { - $pages = $doc_block[ - $this->word_key.":".$this->current_pointer]; - } - } else { - //first COMMON_WORD_THRESHOLD many results fast - if(isset($this->block_pointers[$this->current_pointer])) { - $doc_block = $this->index->getWordDocBlock($this->word_key, - $this->block_pointers[$this->current_pointer], - $generation); - if(isset( - $doc_block[$this->word_key.":".$this->current_pointer] - )) { - $pages = - $doc_block[ - $this->word_key.":".$this->current_pointer]; - } else { - $pages = array(); - } - } else { - $pages = array(); - } - } - - if($this->seen_docs < $this->limit) { - $diff_offset = $this->limit - $this->seen_docs; - - $pages = array_slice($pages, $diff_offset); - } - 
$this->count_block_unfiltered = count($pages); - - if($restrict_phrases != NULL) { - - $out_pages = array(); - if(count($pages) > 0 ) { - foreach($pages as $doc_key => $doc_info) { - - if(isset($doc_info[self::SUMMARY_OFFSET])) { - - $page = $this->index->getPage( - $doc_key, $doc_info[self::SUMMARY_OFFSET]); - /* build a string out of title, links, - and description - */ - $page_string = mb_strtolower( - PhraseParser::extractWordStringPageSummary( - $page)); - - $found = true; - foreach($restrict_phrases as $phrase) { - if(mb_strpos($page_string, $phrase) - === false) { - $found = false; - } - } - if($found == true) { - $out_pages[$doc_key] = $doc_info; - } - } - } - } - $pages = $out_pages; - } - return $pages; - } else { - return -1; - } - } - - /** - * Get the current block of doc summaries for the word iterator and advances - * the current pointer to the next block - * - * @param array $restrict_phrases additional words to restrict doc summaries - * returned - * @return array doc summaries matching the $restrict_phrases - */ - public function nextDocsWithWord($restrict_phrases = NULL) - { - $doc_block = $this->currentDocsWithWord($restrict_phrases); - if($this->seen_docs < $this->limit) { - $this->seen_docs = $this->count_block_unfiltered + $this->limit; - } else { - $this->seen_docs += $this->count_block_unfiltered; - } - $this->restricted_seen_docs += count($doc_block); - if($doc_block == -1 || !is_array($doc_block)) { - return NULL; - } - - $this->current_pointer ++; - if($this->current_pointer > $this->num_full_blocks) { - $flag = false; - while ($this->info_block['CURRENT_GENERATION_INDEX'] < - $this->num_generations - 1 && !$flag) { - $this->info_block['CURRENT_GENERATION_INDEX']++; - $flag = $this->initGeneration(); - } - if ($this->info_block['CURRENT_GENERATION_INDEX'] >= - $this->num_generations - 1) { - $this->current_pointer = - 1; - } - } - - return $doc_block; - - } - -} /** * Encapsulates a set of web page summaries and an inverted word-index of 
terms @@ -679,6 +241,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } $this->summaries = new WebArchiveBundle($dir_name."/summaries", $filter_size, $num_partitions_summaries, $description); + $this->summaries->initCountIfNotExists("VISITED_URLS_COUNT"); + $this->num_partitions_summaries = $this->summaries->num_partitions; $this->index = new WebArchiveBundle( @@ -699,12 +263,16 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants * @param string $key_field field used to select partition * @param string $offset_field field used to record offsets after storing * @param array &$pages data to store + * @param int $visited_urls_count number to add to the count of visited urls + * (visited urls is a smaller number than the total count of objects + * stored in the index). * @return array $pages adjusted with offset field */ - public function addPages($key_field, $offset_field, $pages) + public function addPages($key_field, $offset_field, $pages, + $visited_urls_count) { $result = $this->summaries->addPages($key_field, $offset_field, $pages); - + $this->summaries->addCount($visited_urls_count, "VISITED_URLS_COUNT"); return $result; } @@ -828,7 +396,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants $tmp = array_merge($block_data[$word_key][self::END_BLOCK],$docs_info); - uasort($tmp, "scoreOrderCallback"); + uasort($tmp, "docRankOrderCallback"); $add_cnt = count($tmp); $num_blocks = floor($add_cnt / BLOCK_SIZE); $block_data[$word_key][self::END_BLOCK] = @@ -936,63 +504,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants return true; } - /** - * Gets doc summaries of documents containing a given word and meeting the - * additional provided criteria - * @param string $word_key the word to iterate over to get document results - * of - * @param int $limit number of first document in order to return - * @param int $num number of documents to return summaries of - * @param array 
$restrict_phrases additional words and phrase to store - * further restrict the search - * @param string $phrase_key a hash of the word and restricted phrases to - * store the results of the look up - * @param array $phrase_info info block of the word - * @return array document summaries - */ - public function getSummariesByHash($word_key, $limit, $num, - $restrict_phrases = NULL, $phrase_key = NULL, $phrase_info = NULL) - { - if($phrase_key == NULL) { - $phrase_key = $word_key; - } - - if($phrase_info == NULL) { - $phrase_info = $this->getPhraseIndexInfo($phrase_key); - } - - if($phrase_info == NULL || (isset($phrase_info[self::PARTIAL_COUNT]) - && $phrase_info[self::PARTIAL_COUNT] < $limit + $num)) { - $this->addPhraseIndex( - $word_key, $restrict_phrases, $phrase_key, $limit + $num); - } - - $iterator = new WordIterator($phrase_key, $this, $limit, $phrase_info); - - $num_retrieved = 0; - $pages = array(); - - while(is_array($next_docs = $iterator->nextDocsWithWord()) && - $num_retrieved < $num) { - $num_docs_in_block = count($next_docs); - - foreach($next_docs as $doc_key => $doc_info) { - if(isset($doc_info[self::SUMMARY_OFFSET])) { - $page = $this->getPage( - $doc_key, $doc_info[self::SUMMARY_OFFSET]); - $pages[] = array_merge($doc_info, $page); - $num_retrieved++; - } - if($num_retrieved >= $num) { - break 2; - } - } - } - $results['TOTAL_ROWS'] = $iterator->num_docs; - $results['PAGES'] = $pages; - return $results; - } - /** * Gets the page out of the summaries WebArchiveBundle with the given * key and offset @@ -1108,7 +619,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants WebArchiveBundle::selectPartition( $phrase_key, $this->num_partitions_index); $info = array(); - if($info_block == NULL) { if(!$this->initPartitionIndexFilter($partition)) { @@ -1116,7 +626,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } $filter = & $this->index_partition_filters[$partition]; - if(!$filter->contains($phrase_key)) { + 
if($filter == NULL || !$filter->contains($phrase_key)) { return NULL; } @@ -1196,86 +706,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } - /** - * Adds the supplied phrase to the IndexArchiveBundle. - * - * The most selective word in the phrase is $word_key, the additional - * words are in $restrict_phrases, the hash of the phrase to add is - * $phrase_key, and if the will be a lot of results compute at least - * the first $num_needed. - * - * @param string $word_key hash of most selective word in phrase - * @param array $restrict_phrases additional words in phrase - * @param string $phrase_key hash of phrase to add - * @param $num_needed minimum number of doc results to save if possible - */ - public function addPhraseIndex($word_key, $restrict_phrases, - $phrase_key, $num_needed) - { - if($phrase_key == NULL) { - return; - } - - $partition = - WebArchiveBundle::selectPartition($phrase_key, - $this->num_partitions_index); - - $iterator = new WordIterator($word_key, $this); - $current_count = 0; - $buffer = array(); - $word_data = array(); - $partial_flag = false; - $first_time = true; - - while(is_array($next_docs = - $iterator->nextDocsWithWord($restrict_phrases))) { - $buffer = array_merge($buffer, $next_docs); - $cnt = count($buffer); - - if($cnt > COMMON_WORD_THRESHOLD) { - $word_data[$phrase_key] = - array_slice($buffer, 0, COMMON_WORD_THRESHOLD); - - $this->addPartitionWordData($partition, $word_data, $first_time); - $first_time = false; - $buffer = array_slice($buffer, COMMON_WORD_THRESHOLD); - $current_count += COMMON_WORD_THRESHOLD; - - if($current_count > $num_needed) { - /* notice $num_needed only plays a role when - greater than COMMON_WORD_THRESHOLD - */ - $partial_flag = true; - break; - } - } - } - - $word_data[$phrase_key] = $buffer; - - $this->addPartitionIndexFilter( - $partition, - "delete". $phrase_key . 
($this->generation_info['ACTIVE'] - 1)); - - $this->addPartitionWordData($partition, $word_data); - $this->addPartitionIndexFilter($partition, $phrase_key); - $this->addPartitionIndexFilter($partition, $phrase_key . - $this->generation_info['ACTIVE']); - $this->index_partition_filters[$partition]->save(); - file_put_contents($this->dir_name."/generation.txt", - serialize($this->generation_info)); - - $block_info = $this->readPartitionInfoBlock($partition); - $info = $block_info[$phrase_key]; - $current_count += count($buffer); - if($partial_flag) { - $info[self::PARTIAL_COUNT] = $current_count; - $info[self::COUNT] = - floor($current_count*$iterator->num_docs/$iterator->seen_docs); - $this->setPhraseIndexInfo($phrase_key, $info); - } - } - /** * Computes the words which appear in the fewest or most documents * @@ -1296,6 +726,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants $words_array[$word_key] = $info['TOTAL_COUNT']; } else { $words_array[$word_key] = 0; + return NULL; } } diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php new file mode 100644 index 000000000..c92ae4098 --- /dev/null +++ b/lib/index_bundle_iterators/group_iterator.php @@ -0,0 +1,361 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage library + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads common constants for word indexing + */ +require_once BASE_DIR.'/lib/indexing_constants.php'; + +/** + *Loads base class for iterating + */ +require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php'; + +/** + * This iterator is used to group together documents or document parts + * which share the same url. For instance, a link document item and + * the document that it links to will both be stored in the IndexArchiveBundle + * by the QueueServer. This iterator would combine both these items into + * a single document result with a sum of their score, and a summary, if + * returned, containing text from both sources. 
The iterator's purpose is + * vaguely analagous to a SQL GROUP BY clause + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage library + * @see IndexArchiveBundle + */ +class GroupIterator extends IndexBundleIterator +{ + /** + * The iterator we are using to get documents from + * @var string + */ + var $index_bundle_iterator; + + /** + * The number of documents in the current block before filtering + * by restricted words + * @var int + */ + var $count_block_unfiltered; + /** + * The number of documents in the current block after filtering + * by restricted words + * @var int + */ + var $count_block; + + var $current_block_hashes; + + /** + * The number of iterated docs before the restriction test + * @var int + */ + var $seen_docs_unfiltered; + + /** + * hashed url keys used to keep track of track of groups seen so far + * @var array + */ + var $grouped_keys; + + + /** + * Creates a group iterator with the given parameters. + * + * @param object $index_bundle_iterator to use as a source of documents + * to iterate over + * @param int $limit the first element to return from the list of docs + * iterated over + */ + function __construct($index_bundle_iterator, $limit = 0) + { + $this->index_bundle_iterator = $index_bundle_iterator; + $this->limit = $limit; + $this->num_docs = $this->index_bundle_iterator->num_docs; + $this->reset(); + } + + /** + * Returns the iterators to the first document block that it could iterate + * over + */ + function reset() + { + $this->index_bundle_iterator->reset(); + $time = time(); + $this->grouped_keys = array(); + // -1 == never save, so file name not used using time to be safer + $this->seen_docs = 0; + $this->seen_docs_unfiltered = 0; + $beneath_limit = true; + while($beneath_limit == true) { + + $doc_block = $this->currentDocsWithWord(); + if($doc_block == -1 || !is_array($doc_block)) { + $beneath_limit = false; + continue; + } + if($this->seen_docs + $this->count_block >= $this->limit) { + $beneath_limit = 
false; + continue; + } + $this->advance(); + } + + } + + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and score if there are docs left, -1 otherwise + */ + function findDocsWithWord() + { + $pages = + $this->index_bundle_iterator->currentDocsWithWord(); + $this->count_block_unfiltered = count($pages); + if(!is_array($pages)) { + return $pages; + } + + $this->current_block_hashes = array(); + $pre_out_pages = array(); + + if($this->count_block_unfiltered > 0 ) { + $i = $this->seen_docs; + foreach($pages as $doc_key => $doc_info) { + if(!is_array($doc_info)) {continue;} + $doc_info['KEY'] = $doc_key; + $doc_key_parts = explode(":", $doc_key); + if(count($doc_key_parts) == 1) { + $hash_url = $doc_key_parts[0]; + $doc_info['IS_PAGE'] = true; + } else { + $hash_url = $doc_key_parts[1]; + $doc_info['IS_PAGE'] = false; + } + if(isset($this->grouped_keys[$hash_url])) { + if( $i < $this->limit) { + continue; + } else { + if(isset($pre_out_pages[$hash_url]) ) { + $pre_out_pages[$hash_url][] = $doc_info; + if($doc_info['IS_PAGE'] == true) { + $pre_out_pages[$hash_url]['IS_PAGE'] = true; + } else { + $pre_out_pages[$hash_url]['HASH_INFO_URL'] = + $doc_key_parts[2]; + } + } + } + } else { + + $pre_out_pages[$hash_url][] = $doc_info; + if($doc_info['IS_PAGE'] == true) { + $pre_out_pages[$hash_url]['IS_PAGE'] = true; + } else { + $pre_out_pages[$hash_url]['HASH_INFO_URL'] = + $doc_key_parts[2]; + } + $this->current_block_hashes[] = $hash_url; + $i++; + } + } + //get summary page for groups of link data if exists and don't have + foreach($pre_out_pages as $hash_url => $data) { + if(!isset($data['IS_PAGE'])) { + $hash_info_url= $pre_out_pages[$hash_url]['HASH_INFO_URL']; + $word_iterator = + new WordIterator($hash_info_url, + $this->getIndex(), 0); + $doc_array = $word_iterator->currentDocsWithWord(); + if(is_array($doc_array) && count($doc_array) == 1) { + $keys = array_keys($doc_array); 
+ $key = $keys[0]; + if($doc_array[$key][self::SCORE] > 0) { + $pre_out_pages[$hash_url][$key] = $doc_array[$key]; + $pre_out_pages[$hash_url][$key]['IS_PAGE'] = true; + } else { + unset($pre_out_pages[$hash_url]); + } + } + } else { + unset($pre_out_pages[$hash_url]['IS_PAGE']); + } + if(isset($pre_out_pages[$hash_url]['HASH_INFO_URL'])) { + unset($pre_out_pages[$hash_url]['HASH_INFO_URL']); + } + } + $this->count_block = count($pre_out_pages); + + if($this->seen_docs < $this->limit) { + $total_docs = $this->seen_docs + $this->count_block; + if($total_docs < $this->limit) { + $pre_out_pages =array(); + } else { + $pre_out_pages = array_slice($pre_out_pages, + $this->limit - $this->seen_docs, NULL, true); + } + } + $out_pages = array(); + foreach($pre_out_pages as $hash_url => $group_infos) { + foreach($group_infos as $doc_info) { + $is_page = $doc_info['IS_PAGE']; + unset($doc_info['IS_PAGE']); + if(!isset($out_pages[$hash_url])) { + $out_pages[$hash_url] = $doc_info; + $out_pages[$hash_url][self::SUMMARY_OFFSET] = array(); + if(isset($doc_info[self::SUMMARY_OFFSET]) ) { + $out_pages[$hash_url][self::SUMMARY_OFFSET] = + array(array($doc_info["KEY"], + $doc_info[self::SUMMARY_OFFSET])); + unset($out_pages[$hash_url]["KEY"]); + } + } else { + $fields = array_keys($out_pages[$hash_url]); + foreach($fields as $field) { + if(isset($doc_info[$field]) && + $field != self::SUMMARY_OFFSET) { + $out_pages[$hash_url][$field] += + $doc_info[$field]; + } else if($field == self::SUMMARY_OFFSET && + $is_page == true) { + array_unshift($out_pages[$hash_url][$field], + array($hash_url, $doc_info[$field])); + } else if($field == self::SUMMARY_OFFSET) { + $out_pages[$hash_url][$field][] = + array($doc_info["KEY"], $doc_info[$field]); + } + } + } + } + } + $pages = $out_pages; + } + $this->pages = $pages; + return $pages; + + } + + /** + * Gets the summaries associated with the keys provided the keys + * can be found in the current block of docs returned by this iterator + * @param 
array $keys keys to try to find in the current block of returned + * results + * @return array doc summaries that match provided keys + */ + function getSummariesFromCurrentDocs($keys = NULL) + { + if($this->current_block_fresh == false) { + $result = $this->currentDocsWithWord(); + if(!is_array($result)) { + return $result; + } + } + if(!is_array($this->pages)) { + return $this->pages; + } + if($keys == NULL) { + $keys = array_keys($this->pages); + } + $out_pages = array(); + foreach($keys as $doc_key) { + if(!isset($this->pages[$doc_key])) { + continue; + } else { + $doc_info = $this->pages[$doc_key]; + } + if(isset($doc_info[self::SUMMARY_OFFSET]) && + is_array($doc_info[self::SUMMARY_OFFSET])) { + $out_pages[$doc_key] = $doc_info; + foreach($doc_info[self::SUMMARY_OFFSET] as $offset_array) { + list($key, $summary_offset) = $offset_array; + $index = $this->getIndex($key); + $page = $index->getPage( + $key, $summary_offset); + if(!isset($out_pages[$doc_key][self::SUMMARY])) { + $out_pages[$doc_key][self::SUMMARY] = $page; + } else if (isset($page[self::DESCRIPTION])) { + $out_pages[$doc_key][self::SUMMARY][self::DESCRIPTION].= + " .. 
".$page[self::DESCRIPTION]; + } + } + } + } + return $out_pages; + + } + + + + /** + * Forwards the iterator one group of docs + */ + function advance() + { + $this->advanceSeenDocs(); + + $this->seen_docs_unfiltered += $this->count_block_unfiltered; + + if($this->seen_docs_unfiltered > 0) { + $this->num_docs = + floor(($this->seen_docs*$this->index_bundle_iterator->num_docs)/ + $this->seen_docs_unfiltered); + } else { + $this->num_docs = 0; + } + + + foreach($this->current_block_hashes as $hash_url) { + $this->grouped_keys[$hash_url] = true; + } + + $this->index_bundle_iterator->advance(); + + } + + /** + * Returns the index associated with this iterator + * @return object the index + */ + function getIndex($key = NULL) + { + return $this->index_bundle_iterator->getIndex($key); + } +} +?> diff --git a/lib/index_bundle_iterators/index_bundle_iterator.php b/lib/index_bundle_iterators/index_bundle_iterator.php new file mode 100644 index 000000000..6759e80fa --- /dev/null +++ b/lib/index_bundle_iterators/index_bundle_iterator.php @@ -0,0 +1,215 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage library + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads common constants for word indexing + */ +require_once BASE_DIR.'/lib/indexing_constants.php'; + +/** + * Abstract classed used to model iterating documents indexed in + * an IndexArchiveBundle or set of such bundles. + * + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage library + * @see IndexArchiveBundle + */ +abstract class IndexBundleIterator implements IndexingConstants, CrawlConstants +{ + + /** + * Estimate of the number of documents that this iterator can return + * @var int + */ + var $num_docs; + + /** + * The number of documents already iterated over + * @var int + */ + var $seen_docs; + + /** + * First document that should be returned + * amongst all of the documents associated with the + * iterator's $word_key + * @var int + */ + var $limit; + /** + * The number of documents in the current block + * @var int + */ + var $count_block; + + /** + * Cache of what currentDocsWithWord returns + * @var array + */ + var $pages; + + /** + * Says whether the value in $this->count_block is up to date + * @var bool + */ + var $current_block_fresh; + + + + /** + * Returns the iterators to the first document block that it could iterate + * over + */ + abstract function reset(); + + /** + * Forwards the iterator one group of docs + */ + abstract function advance(); + /** + * Returns the index associated with this iterator + * @return object the index + */ + abstract function getIndex($key = NULL); + + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and score if there are docs left, -1 otherwise + */ + abstract function findDocsWithWord(); + + 
/** + * Gets the current block of doc ids and score associated with the + * this iterators word + * + * @param bool $with_summaries specifies whether or not to return the + * summaries associated with the document + * @return mixed doc ids and score if there are docs left, -1 otherwise + */ + function currentDocsWithWord() + { + if($this->current_block_fresh == true) { + return $this->pages; + } + $this->current_block_fresh = true; + return $this->findDocsWithWord(); + } + + /** + * Gets the summaries associated with the keys provided the keys + * can be found in the current block of docs returned by this iterator + * @param array $keys keys to try to find in the current block of returned + * results + * @return array doc summaries that match provided keys + */ + function getSummariesFromCurrentDocs($keys = NULL) + { + + $index = $this->getIndex(); + if($this->current_block_fresh == false) { + $pages = $this->currentDocsWithWord(); + if(!is_array($pages)) { + return $pages; + } + } else { + $pages = & $this->pages; + } + if($keys == NULL) { + if(is_array($pages)) { + $keys = array_keys($pages); + } else { + return NULL; + } + } + $out_pages = array(); + + foreach($keys as $doc_key) { + if(!isset($pages[$doc_key])) { + continue; + } else { + $doc_info = $pages[$doc_key]; + } + if(isset($doc_info[self::SUMMARY_OFFSET])) { + $page = $index->getPage( + $doc_key, $doc_info[self::SUMMARY_OFFSET]); + $out_pages[$doc_key] = $doc_info; + $out_pages[$doc_key][self::SUMMARY] = $page; + } + }; + return $out_pages; + } + + /** + * Get the current block of doc summaries for the word iterator and advances + * the current pointer to the next block + * + * @return array doc summaries matching the $this->restrict_phrases + */ + function nextDocsWithWord() + { + $doc_block = $this->getSummariesFromCurrentDocs(); + + if($doc_block == -1 || !is_array($doc_block) ) { + return NULL; + } + + $this->advance(); + + return $doc_block; + + } + + /** + * Updates the seen_docs count during an 
advance() call + */ + function advanceSeenDocs() + { + + if($this->current_block_fresh != true) { + $doc_block = $this->currentDocsWithWord(); + if($doc_block == -1 || !is_array($doc_block) ) { + return; + } + } + $this->current_block_fresh = false; + $this->seen_docs += $this->count_block; + } + +} +?> diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php new file mode 100644 index 000000000..8a4d6d5b5 --- /dev/null +++ b/lib/index_bundle_iterators/intersect_iterator.php @@ -0,0 +1,302 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage library + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads common constants for word indexing + */ +require_once BASE_DIR.'/lib/indexing_constants.php'; + +/** + *Loads BloomFilterFile to remember things we've already grouped + */ +require_once BASE_DIR.'/lib/bloom_filter_file.php'; + + +/** + *Loads base class for iterating + */ +require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php'; + +/** + * Used to iterate over the documents which occur in all of a set of + * WordIterator results + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage library + * @see IndexArchiveBundle + */ +class IntersectIterator extends IndexBundleIterator +{ + /** + * An array of iterators whose interection we get documents from + * @var array + */ + var $index_bundle_iterators; + /** + * Number of elements in $this->index_bundle_iterators + * @var int + */ + var $num_iterators; + + /** + * The number of documents in the current block before filtering + * by restricted words + * @var int + */ + var $count_block_unfiltered; + /** + * The number of documents in the current block after filtering + * by restricted words + * @var int + */ + var $count_block; + + /** + * The number of iterated docs before the restriction test + * @var int + */ + var $seen_docs_unfiltered; + + /** + * Index of the iterator amongst those we are intersecting to advance + * next + * @var int + */ + var $to_advance_index; + + /** + * Creates an intersect iterator with the given parameters. 
+ * + * @param object $index_bundle_iterator to use as a source of documents + * to iterate over + * @param int $limit the first element to return from the list of docs + * iterated over + */ + function __construct($index_bundle_iterators, $limit = 0) + { + $this->index_bundle_iterators = $index_bundle_iterators; + $this->limit = $limit; + + $this->num_iterators = count($index_bundle_iterators); + $this->num_docs = -1; + + /* + the most results we can return is the size of the least num_docs + of what we are itrerating over + */ + for($i = 0; $i < $this->num_iterators; $i++) { + if( $this->num_docs < 0 || + $this->index_bundle_iterators[$i]->num_docs < $this->num_docs) { + $this->num_docs = $this->index_bundle_iterators[$i]->num_docs; + } + } + $this->reset(); + } + + /** + * Returns the iterators to the first document block that it could iterate + * over + */ + function reset() + { + foreach($this->index_bundle_iterators as $iterator) { + $iterator->reset(); + } + + $this->seen_docs = 0; + $this->seen_docs_unfiltered = 0; + $beneath_limit = true; + while($beneath_limit == true) { + $doc_block = $this->currentDocsWithWord(); + if($doc_block == -1 || !is_array($doc_block)) { + $beneath_limit = false; + continue; + } + if($this->seen_docs + $this->count_block >= $this->limit) { + $beneath_limit = false; + continue; + } + $this->advance(); + } + } + + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and rank if there are docs left, -1 otherwise + */ + function findDocsWithWord() + { + $pages = array(); + $high_ranks = array(); + $last = $this->num_iterators - 1; + for($i = 0; $i < $this->num_iterators; $i++) { + $pages[$i] = + $this->index_bundle_iterators[$i]->currentDocsWithWord(); + if(!is_array($pages[$i]) || count($pages[$i]) == 0) { + $this->to_advance_index = $i; + return $pages[$i]; + } + list($low_ranks[$i], $high_ranks[$i]) = + $this->lowHighRanks($pages[$i], $i); + } 
+ uasort($low_ranks, "docRankOrderCallback"); + + $low_ranks = array_values($low_ranks); + + $low_rank = $low_ranks[$last][self::DOC_RANK]; + + $this->to_advance_index = $low_ranks[0]["INDEX"]; + $this->count_block_unfiltered = count($pages[$this->to_advance_index]); + + $docs = array(); + $looping = true; + + while ($looping == true) { + for($i = 0; $i <= $last; $i++) { + list( ,$high_ranks[$i]) = + $this->lowHighRanks($pages[$i], $i, false); + } + $broke = false; + $score = 0; + $high_rank = $high_ranks[0][self::DOC_RANK]; + $high_key = $high_ranks[0]["KEY"]; + $high_index = $high_ranks[0]["INDEX"]; + $to_deletes = array(); + for($i = 1; $i <= $last; $i++) { + if($high_ranks[$i][self::DOC_RANK] < $low_rank ) { + $looping = false; + break 2; + } + if($high_ranks[$i][self::DOC_RANK] > $high_rank || + ($high_ranks[$i][self::DOC_RANK] == $high_rank && + strcmp($high_ranks[$i]["KEY"], $high_key) > 0) + ) { + $broke = true; + $high_rank = $high_ranks[$i][self::DOC_RANK]; + $high_index = $high_ranks[$i]["INDEX"]; + $high_key = $high_ranks[$i]["KEY"]; + $to_deletes[$high_index] = $high_key; + } + $score += $high_ranks[$i][self::SCORE]; + } + if($broke == false) { + $docs[$high_key] = $pages[$high_index][$high_key]; + $docs[$high_key][self::SCORE] = $score; + $to_deletes[$high_index] = $high_key; + } + + foreach($to_deletes as $index => $key) { + unset($pages[$index][$key]); + if(count($pages[$index]) == 0) { + $looping = false; + } + } + + } + $this->count_block = count($docs); + $this->pages = $docs; + return $docs; + } + + /** + * Given a collection of documents, returns info about the low and high + * ranking documents. 
Namely, their ranks, keys, + * index in word iterator array, and scores + * + * @param array &$docs documents to get low high info from + * @param int $index which word iterator these docs came from + * @param boo $sort_flag whether to sort the docs (if true) or to assume + * the docs are already sorted by rank + * @return array desired info + */ + function lowHighRanks(&$docs, $index, $sort_flag = true) + { + if($sort_flag == true) { + uasort($docs, "docRankOrderCallback"); + } + reset($docs); + $high = array(); + $high["KEY"] = key($docs); + $high[self::DOC_RANK] = $docs[$high["KEY"]][self::DOC_RANK]; + $high[self::SCORE] = $docs[$high["KEY"]][self::SCORE]; + $high["INDEX"] = $index; + end($docs); + $low = array(); + $low["KEY"] = key($docs); + $low[self::DOC_RANK] = $docs[$low["KEY"]][self::DOC_RANK]; + $low[self::SCORE] = $docs[$low["KEY"]][self::SCORE]; + $low["INDEX"] = $index; + return array($low, $high); + } + + /** + * Forwards the iterator one group of docs + */ + function advance() + { + $this->advanceSeenDocs(); + + $this->seen_docs_unfiltered += $this->count_block_unfiltered; + + $min_num_docs = 10000000000; + for($i = 0; $i < $this->num_iterators; $i++) { + if($this->index_bundle_iterators[$i]->num_docs < $min_num_docs) { + $min_num_docs = $this->index_bundle_iterators[$i]->num_docs; + } + } + if($this->seen_docs_unfiltered > 0) { + $this->num_docs = + floor(($this->seen_docs * $min_num_docs) / + $this->seen_docs_unfiltered); + } else { + $this->num_docs = 0; + } + $this->index_bundle_iterators[$this->to_advance_index]->advance(); + + } + + /** + * Returns the index associated with this iterator + * @return object the index + */ + function getIndex($key = NULL) + { + return $this->index_bundle_iterators[0]->getIndex($key = NULL); + } +} +?> diff --git a/lib/index_bundle_iterators/phrase_filter_iterator.php b/lib/index_bundle_iterators/phrase_filter_iterator.php new file mode 100644 index 000000000..943a84188 --- /dev/null +++ 
b/lib/index_bundle_iterators/phrase_filter_iterator.php @@ -0,0 +1,311 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage library + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads common constants for word indexing + */ +require_once BASE_DIR.'/lib/indexing_constants.php'; + +/** + *Loads base class for iterating + */ +require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php'; + +/** + * Used to iterate through a collection of documents to return only those + * which have certain restricted_phrases and don't have disallowed_phrases. + * + * For restricted_phrases a string like "Chris * Homepage" will match any + * string where * has been replace by any other string. So for example it will + * match Chris Pollett's Homepage. 
+ * + * disallowed_phrases are really just disallowed words and must be an exact + * match + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage library + * @see IndexArchiveBundle + */ +class PhraseFilterIterator extends IndexBundleIterator +{ + /** + * The iterator we are using to get documents from + * @var string + */ + var $index_bundle_iterator; + + /** + * This iterator returns only documents containing all the elements of + * restrict phrases + * @var array + */ + var $restrict_phrases; + + /** + * This iterator returns only documents not containing any the elements of + * disallow phrases + * @var array + */ + var $disallow_phrases; + /** + * The number of documents in the current block before filtering + * by restricted words + * @var int + */ + var $count_block_unfiltered; + + /** + * The number of iterated docs before the restriction test + * @var int + */ + var $seen_docs_unfiltered; + + /** + * Doc block with summaries for current doc block + * @var array + */ + var $summaries; + + /** + * A weighting factor to multiply with each doc SCORE returned from this + * iterator + * @var float + */ + var $weight; + + /** + * Creates a phrase filter iterator with the given parameters. 
+ * + * @param object $index_bundle_iterator to use as a source of documents + * to iterate over + * @param array $restrict_phrases this iterator returns only documents from + * $index_bundle_iterator containing all the elements of restrict + * phrases + * @param array $disallow_phrases this iterator returns only documents from + * $index_bundle_iterator not containing any of the words in disallow + * phrases + * @param float $weight a quantity to multiply each score returned from + * this iterator with + * @param int $limit the first element to return from the list of docs + * iterated over + */ + function __construct($index_bundle_iterator, $restrict_phrases, + $disallow_phrases, $weight = 1, $limit = 0) + { + $this->index_bundle_iterator = $index_bundle_iterator; + $this->limit = $limit; + $this->restrict_phrases = $restrict_phrases; + $this->disallow_phrases = $disallow_phrases; + $this->num_docs = $this->index_bundle_iterator->num_docs; + $this->weight = $weight; + $this->current_block_fresh = false; + $this->reset(); + } + + /** + * Returns the iterators to the first document block that it could iterate + * over + */ + function reset() + { + $this->index_bundle_iterator->reset(); + $this->seen_docs = 0; + $this->seen_docs_unfiltered = 0; + $beneath_limit = true; + while($beneath_limit == true) { + $doc_block = $this->currentDocsWithWord(); + if($doc_block == -1 || !is_array($doc_block)) { + $beneath_limit = false; + continue; + } + if($this->seen_docs + $this->count_block > $this->limit) { + $beneath_limit = false; + continue; + } + $this->advance(); + } + } + + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and score if there are docs left, -1 otherwise + */ + function findDocsWithWord() + { + $pages = $this->index_bundle_iterator->getSummariesFromCurrentDocs(); + $this->count_block_unfiltered = count($pages); + if(!is_array($pages)) { + return $pages; + } + + 
$out_pages = array(); + if(count($pages) > 0 ) { + foreach($pages as $doc_key => $doc_info) { + if(isset($doc_info[self::SUMMARY_OFFSET])) { + /* + if have SUMMARY_OFFSET then should have tried to get + TITLE, etc. + */ + $page_string = + PhraseParser::extractWordStringPageSummary( + $doc_info[self::SUMMARY]); + + $found = true; + + if($this->restrict_phrases != NULL) { + foreach($this->restrict_phrases as $pre_phrase) { + $phrase_parts = explode("*", $pre_phrase); + + $phrase = ""; + $first= ""; + foreach($phrase_parts as $part) {; + $phrase .= $first . preg_quote($part); + $first= '(.)*'; + } + + if(strlen($phrase) > 0 && + mb_eregi($phrase, $page_string) === false) { + $found = false; + } + } + } + if($this->disallow_phrases != NULL && + is_array($this->disallow_phrases)) { + foreach($this->disallow_phrases as $phrase) { + if(strlen($phrase) > 0 && + mb_eregi($phrase, $page_string) !== false) { + $found = false; + } + } + } + if($found == true) { + $doc_info["WEIGHT"] = $this->weight; + $doc_info[self::SCORE] *= $this->weight; + $out_pages[$doc_key] = $doc_info; + } + } + } + $pages = $out_pages; + } + $this->count_block = count($pages); + + if($this->seen_docs < $this->limit) { + $total_docs = $this->seen_docs + $this->count_block; + if($total_docs < $this->limit) { + $pages =array(); + } else { + $pages = array_slice($pages, + $this->limit - $this->seen_docs, NULL, true); + } + } + $this->summaries = $pages; + $this->pages = array(); + foreach($pages as $doc_key => $doc_info) { + $this->pages[$doc_key] = $doc_info; + unset($this->pages[$doc_key][self::SUMMARY]); + } + return $pages; + + } + + /** + * Gets the summaries associated with the keys provided the keys + * can be found in the current block of docs returned by this iterator + * @param array $keys keys to try to find in the current block of returned + * results + * @return array doc summaries that match provided keys + */ + function getSummariesFromCurrentDocs($keys = NULL) + { + 
if($this->current_block_fresh == false) {
+            $result = $this->currentDocsWithWord();
+            if(!is_array($result)) {
+                return $result;
+            }
+        }
+        if(!is_array($this->pages)) {
+            return $this->pages;
+        }
+        if($keys == NULL) {
+            $keys = array_keys($this->pages);
+        }
+        $out_pages = array();
+        foreach($keys as $doc_key) {
+            if(!isset($this->summaries[$doc_key])) {
+                continue;
+            } else {
+                $out_pages[$doc_key] = $this->summaries[$doc_key];
+            }
+        }
+        return $out_pages;
+    }
+
+
+    /**
+     * Forwards the iterator one group of docs
+     */
+    function advance()
+    {
+        $this->advanceSeenDocs();
+
+
+        $this->seen_docs_unfiltered += $this->count_block_unfiltered;
+
+
+
+        if($this->seen_docs_unfiltered > 0) {
+            $this->num_docs =
+                floor(($this->seen_docs*$this->index_bundle_iterator->num_docs)/
+                $this->seen_docs_unfiltered);
+        } else {
+            $this->num_docs = 0;
+        }
+
+        $this->index_bundle_iterator->advance();
+    }
+
+    /**
+     * Returns the index associated with this iterator
+     * @return object the index
+     */
+    function getIndex($key = NULL)
+    {
+        return $this->index_bundle_iterator->getIndex($key);
+    }
+}
+?>
diff --git a/lib/index_bundle_iterators/union_iterator.php b/lib/index_bundle_iterators/union_iterator.php
new file mode 100644
index 000000000..4c8ae8101
--- /dev/null
+++ b/lib/index_bundle_iterators/union_iterator.php
@@ -0,0 +1,260 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage library + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads common constants for word indexing + */ +require_once BASE_DIR.'/lib/indexing_constants.php'; + +/** + *Loads BloomFilterFile to remember things we've already grouped + */ +require_once BASE_DIR.'/lib/bloom_filter_file.php'; + + +/** + *Loads base class for iterating + */ +require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php'; + +/** + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage library + * @see IndexArchiveBundle + */ +class UnionIterator extends IndexBundleIterator +{ + /** + * An array of iterators whose interection we get documents from + * @var array + */ + var $index_bundle_iterators; + /** + * Number of elements in $this->index_bundle_iterators + * @var int + */ + var $num_iterators; + + /** + * The number of documents in the current block before filtering + * by restricted words + * @var int + */ + var $count_block_unfiltered; + /** + * The number of documents in the current block after filtering + * by restricted words + * @var int + */ + var $count_block; + + /** + * The number of iterated docs before the restriction test + * @var int + */ + var $seen_docs_unfiltered; + + + /** + * Creates a union iterator with the given parameters. 
+ * + * @param object $index_bundle_iterator to use as a source of documents + * to iterate over + * @param int $limit the first element to return from the list of docs + * iterated over + */ + function __construct($index_bundle_iterators, $limit = 0) + { + $this->index_bundle_iterators = $index_bundle_iterators; + $this->limit = $limit; + /* + estimate number of results by sum of all iterator counts, + then improve estimate as iterate + */ + $this->num_iterators = count($index_bundle_iterators); + $this->num_docs = 0; + for($i = 0; $i < $this->num_iterators; $i++) { + $this->num_docs += $this->index_bundle_iterators[$i]->num_docs; + } + $this->reset(); + } + + /** + * Returns the iterators to the first document block that it could iterate + * over + */ + function reset() + { + foreach($this->index_bundle_iterators as $iterator) { + $iterator->reset(); + } + + $this->seen_docs = 0; + $this->seen_docs_unfiltered = 0; + $beneath_limit = true; + while($beneath_limit == true) { + $doc_block = $this->currentDocsWithWord(); + if($doc_block == -1 || !is_array($doc_block)) { + $beneath_limit = false; + continue; + } + if($this->seen_docs + $this->count_block >= $this->limit) { + $beneath_limit = false; + continue; + } + $this->advance(); + } + } + + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and score if there are docs left, -1 otherwise + */ + function findDocsWithWord() + { + $pages = array(); + $docs = array(); + $high_score = array(); + $high_score = array(); + $found_docs = false; + for($i = 0; $i < $this->num_iterators; $i++) { + $docs = $this->index_bundle_iterators[$i]->currentDocsWithWord(); + if(is_array($docs)) { + $doc_keys = array_keys($docs); + foreach($doc_keys as $key) { + $docs[$key]["ITERATOR"] = $i; + } + $pages = array_merge($pages, $docs); + $found_docs = true; + } + + } + if($found_docs == false) { + $this->pages = $docs; + return $docs; + } + 
$this->count_block_unfiltered = count($pages); + $this->pages = $pages; + $this->count_block = count($pages); + return $pages; + } + + /** + * Gets the summaries associated with the keys provided the keys + * can be found in the current block of docs returned by this iterator + * @param array $keys keys to try to find in the current block of returned + * results + * @return array doc summaries that match provided keys + */ + function getSummariesFromCurrentDocs($keys = NULL) + { + if($this->current_block_fresh == false) { + $result = $this->currentDocsWithWord(); + if(!is_array($result)) { + return $result; + } + } + if(!is_array($this->pages)) { + return $this->pages; + } + if($keys == NULL) { + $keys = array_keys($this->pages); + } + $out_pages = array(); + echo "hello".$this->pages[$key[0]]["ITERATOR"]."<br/>"; + foreach($keys as $doc_key) { + if(!isset($this->pages[$doc_key]["ITERATOR"])) { + continue; + } else { + $out_pages[$doc_key] = $this->index_bundle_iterators[ + $this->pages[ + $doc_key]["ITERATOR"]]->getSummariesFromCurrentDocs( + array($doc_key)); + } + } + return $out_pages; + } + + /** + * Forwards the iterator one group of docs + */ + function advance() + { + $this->advanceSeenDocs(); + + $this->seen_docs_unfiltered += $this->count_block_unfiltered; + + $total_num_docs = 0; + for($i = 0; $i < $this->num_iterators; $i++) { + $total_num_docs += $this->index_bundle_iterators[$i]->num_docs; + $this->index_bundle_iterators[$i]->advance(); + } + if($this->seen_docs_unfiltered > 0) { + $this->num_docs = + floor(($this->seen_docs * $total_num_docs) / + $this->seen_docs_unfiltered); + } else { + $this->num_docs = 0; + } + } + + /** + * Returns the index associated with this iterator + * @return object the index + */ + function getIndex($key = NULL) + { + if($key != NULL) { + if($this->current_block_fresh == false) { + $result = $this->currentDocsWithWord(); + if(!is_array($result)) { + return $this->index_bundle_iterators[0]->getIndex($key); + } + } + 
if(!isset($this->pages[$key]["ITERATOR"])) { + return $this->index_bundle_iterators[0]->getIndex($key); + } + return $this->index_bundle_iterators[ + $this->pages[$key]["ITERATOR"]]->getIndex($key); + } else { + return $this->index_bundle_iterators[0]->getIndex($key); + } + } +} +?> diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php new file mode 100644 index 000000000..7512d2a4a --- /dev/null +++ b/lib/index_bundle_iterators/word_iterator.php @@ -0,0 +1,399 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage library + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads common constants for word indexing + */ +require_once BASE_DIR.'/lib/indexing_constants.php'; + +/** + *Loads base class for iterating + */ +require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php'; + +/** + * Used to iterate through the documents associated with a word in + * an IndexArchiveBundle. It also makes it easy to get the summaries + * of these documents. + * + * A description of how words and the documents containing them are stored + * is given in the documentation of IndexArchiveBundle. To iterate over + * all documents containng a word, its hash, work_key, is formed. Then using + * the Bloom filter for that partition, it is determined if the word is stored + * at all, and if it is, which generations it occurs in. Then the iterator + * is set to point to the first block of the first generation the word appears + * in that is greater than the limit of the WordIterator. Thereafter, + * nextDocsWithWord will advance $this->current_pointer by one per call. + * $this->current_pointer keeps track of which block of documents containing + * the word to return. If it is less than COMMON_WORD_THRESHOLD/BLOCK_SIZE and + * there are still more blocks, then the corresponding block_pointer of the word + * from the generation's partition info_block is used to look up the offset to + * the doc block. If it is greater than this value then the linked list + * of doc blocks pointed to for the partition is followed to get the appropriate + * block. This list is in the order that words were stored in the index so + * LIST_OFFSET points to the last block stored, which in turn points to the + * next to last block, etc. 
Finally, when all the blocks in the linked-list are + * exhausted, the remaining docs for that generation for that word are stored + * in the info block for the word itself (this will always be less than + * BLOCK_SIZE many). Once all the docs for a word for a generation have been + * iterated through, than iteration proceeds to the next generation containing + * the word. + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage library + * @see IndexArchiveBundle + */ +class WordIterator extends IndexBundleIterator +{ + /** + * hash of word that the iterator iterates over + * @var string + */ + var $word_key; + /** + * The IndexArchiveBundle this index is associated with + * @var object + */ + var $index; + + /** + * If iterating through the linked-list portions of the documents + * the next byte offset in the WebArchive based linked-list + * @var int + */ + var $next_offset; + /** + * Block number of the last block of docs + * @var int + */ + var $last_pointed_block; + /** + * @var int + */ + var $list_offset; + + /** + * Pointers to offsets for blocks containing docs with the given word + * for the current generation + * @var array + */ + var $block_pointers; + /** + * Number of completely full blocks of documents for the current generation + * @var int + */ + var $num_full_blocks; + /** + * Number of generations word appears in + * @var int + */ + var $num_generations; + /** + * Used to store the contents of the last partially full block + * @var int + */ + var $last_block; + /** + * the info block of the WebArchive that the word lives in + * @var object + */ + var $info_block; + /** + * Stores the number of the current block of documents we are at in the + * set of all blocks of BLOCK_SIZE many documents + * @var int + */ + var $current_pointer; + + /** + * Creates a word iterator with the given parameters. 
+ * + * @param string $word_key hash of word or phrase to iterate docs of + * @param object $index the IndexArchiveBundle to use + * @param int $limit the first element to return from the list of docs + * iterated over + * @param object $info_block the info block of the WebArchive + * associated with the word in the index. If NULL, then this will + * loaded in WordIterator::reset() + */ + function __construct($word_key, $index, $limit = 0, $info_block = NULL) + { + $this->word_key = $word_key; + $this->index = $index; + $this->limit = $limit; + $this->info_block = $info_block; + $this->current_block_fresh = false; + $this->reset(); + } + + /** + * Returns the iterators to the first document block that it could iterate + * over + * + */ + function reset() + { + $this->count_block = 0; + $this->seen_docs = 0; + + $partition = + WebArchiveBundle::selectPartition($this->word_key, + $this->index->num_partitions_index); + if($this->info_block == NULL) { + $this->info_block = + $this->index->getPhraseIndexInfo($this->word_key); + } + if($this->info_block !== NULL) { + $this->num_generations = count($this->info_block['GENERATIONS']); + $count_till_generation = $this->info_block[self::COUNT]; + + while($this->limit >= $count_till_generation) { + $this->info_block['CURRENT_GENERATION_INDEX']++; + if($this->num_generations <= + $this->info_block['CURRENT_GENERATION_INDEX']) { + $this->num_docs = 0; + $this->current_pointer = -1; + return; + } + $info_block = $this->index->getPhraseIndexInfo( + $this->word_key, + $this->info_block['CURRENT_GENERATION_INDEX'], + $this->info_block); + if($info_block !== NULL) { + $this->info_block = $info_block; + } + $count_till_generation += $this->info_block[self::COUNT]; + } + $this->seen_docs = $count_till_generation - + $this->info_block[self::COUNT]; + + } + + + $this->initGeneration(); + + + } + + /** + * Sets up the iterator to iterate through the current generation. 
+ * + * @return bool whether the initialization succeeds + */ + function initGeneration() + { + + if($this->info_block !== NULL) { + $info_block = $this->index->getPhraseIndexInfo( + $this->word_key, $this->info_block['CURRENT_GENERATION_INDEX'], + $this->info_block); + if($info_block === NULL) { + return false; + } + $this->info_block = & $info_block; + $this->num_docs = $info_block['TOTAL_COUNT']; + $this->num_docs_generation = $info_block[self::COUNT]; + + $this->current_pointer = + max(floor(($this->limit - $this->seen_docs) / BLOCK_SIZE), 0); + $this->seen_docs += $this->current_pointer*BLOCK_SIZE; + $this->last_block = $info_block[self::END_BLOCK]; + $this->num_full_blocks = + floor($this->num_docs_generation / BLOCK_SIZE); + if($this->num_docs_generation > COMMON_WORD_THRESHOLD) { + $this->last_pointed_block = + floor(COMMON_WORD_THRESHOLD / BLOCK_SIZE); + } else { + $this->last_pointed_block = $this->num_full_blocks; + } + + for($i = 0; $i < $this->last_pointed_block; $i++) { + if(isset($info_block[$i])) { + $this->block_pointers[$i] = $info_block[$i]; + } + } + + if($this->num_docs_generation > COMMON_WORD_THRESHOLD) { + if($info_block[self::LIST_OFFSET] === NULL) { + $this->list_offset = NULL; + } else { + $this->list_offset = $info_block[self::LIST_OFFSET]; + } + } + + } else { + $this->num_docs = 0; + $this->num_docs_generation = 0; + $this->current_pointer = -1; + } + return true; + } + + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and score if there are docs left, -1 otherwise + */ + function findDocsWithWord() + { + if($this->num_generations <= + $this->info_block['CURRENT_GENERATION_INDEX']) { + $this->pages = NULL; + return -1; + } + $generation = + $this->info_block['GENERATIONS'][ + $this->info_block['CURRENT_GENERATION_INDEX']]; + if($this->current_pointer >= 0) { + if($this->current_pointer == $this->num_full_blocks) { + $pages = $this->last_block; + 
} else if ($this->current_pointer >= $this->last_pointed_block) { + /* if there are more than COMMON_WORD_THRESHOLD many + results and we're not at the last block yet + */ + if($this->list_offset === NULL) { + $this->pages = NULL; + return -1; + } + $offset = $this->list_offset; + $found = false; + do { + /* the link list is actually backwards to the order we want + For now, we cycle along the list from the last data + stored until we find the block we want. This is slow + but we are relying on the fact that each generation is + not too big. + */ + $doc_block = $this->index->getWordDocBlock($this->word_key, + $offset, $generation); + $word_keys = array_keys($doc_block); + $found_key = NULL; + foreach($word_keys as $word_key) { + if(strstr($word_key, $this->word_key.":")) { + $found_key = $word_key; + if(isset($doc_block[ + $found_key][self::LIST_OFFSET])) { + //only one list offset/docblock + break; + } + } + } + if($found_key === NULL) { + break; + } + if(isset($doc_block[ + $this->word_key.":".$this->current_pointer])) { + $found = true; + break; + } + $offset = $doc_block[$found_key][self::LIST_OFFSET]; + } while($offset != NULL); + if($found != true) { + $pages = array(); + } else { + $pages = & $doc_block[ + $this->word_key.":".$this->current_pointer]; + } + } else { + //first COMMON_WORD_THRESHOLD many results fast + if(isset($this->block_pointers[$this->current_pointer])) { + $doc_block = $this->index->getWordDocBlock($this->word_key, + $this->block_pointers[$this->current_pointer], + $generation); + if(isset( + $doc_block[$this->word_key.":".$this->current_pointer] + )) { + $pages = & + $doc_block[ + $this->word_key.":".$this->current_pointer]; + } else { + $pages = array(); + } + } else { + $pages = array(); + } + } + + if($this->seen_docs < $this->limit) { + $diff_offset = $this->limit - $this->seen_docs; + + $pages = array_slice($pages, $diff_offset); + } + $this->pages = & $pages; + $this->count_block = count($pages); + return $pages; + } else { + 
$this->pages = NULL; + return -1; + } + } + + + /** + * Forwards the iterator one group of docs + */ + function advance() + { + if($this->current_pointer < 0) {return;} + + $this->advanceSeenDocs(); + + $this->current_pointer ++; + if($this->current_pointer > $this->num_full_blocks) { + $flag = false; + while ($this->info_block['CURRENT_GENERATION_INDEX'] < + $this->num_generations - 1 && !$flag) { + $this->info_block['CURRENT_GENERATION_INDEX']++; + $flag = $this->initGeneration(); + } + if ($this->info_block['CURRENT_GENERATION_INDEX'] >= + $this->num_generations - 1) { + $this->current_pointer = - 1; + } + } + } + + /** + * Returns the index associated with this iterator + * @return object the index + */ + function getIndex($key = NULL) + { + return $this->index; + } +} +?> diff --git a/lib/persistent_structure.php b/lib/persistent_structure.php index 3f6f7c3b6..40594e750 100755 --- a/lib/persistent_structure.php +++ b/lib/persistent_structure.php @@ -60,7 +60,7 @@ class PersistentStructure * @var int */ var $unsaved_operations; - /** Number of operation between saves + /** Number of operation between saves. 
If == -1 never save * @var int */ var $save_frequency; @@ -71,7 +71,8 @@ class PersistentStructure * * @param string $fname the name of the file to store the * PersistentStructure in - * @param int $save_frequency the number of operation before a save + * @param int $save_frequency the number of operation before a save If + * <= 0 never save */ public function __construct($fname, $save_frequency = self::DEFAULT_SAVE_FREQUENCY) @@ -107,7 +108,8 @@ class PersistentStructure function checkSave() { $this->unsaved_operations++; - if($this->unsaved_operations >= $this->save_frequency) { + if($this->save_frequency > 0 && + $this->unsaved_operations >= $this->save_frequency) { $this->save(); $this->unsaved_operations = 0; } diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php index c22b1b664..967859823 100755 --- a/lib/phrase_parser.php +++ b/lib/phrase_parser.php @@ -67,17 +67,8 @@ class PhraseParser $page[CrawlConstants::TITLE]); $description_phrase_string = mb_ereg_replace("[[:punct:]]", " ", $page[CrawlConstants::DESCRIPTION]); - $link_phrase_string = ""; - $link_urls = array(); - foreach($page[CrawlConstants::LINKS] as $url => $link_text) { - $link_phrase_string .= " $link_text"; - } - - $link_phrase_string = mb_ereg_replace("[[:punct:]]", " ", - $link_phrase_string); - $page_string = $title_phrase_string . " " . $description_phrase_string . - " " . $link_phrase_string; + $page_string = $title_phrase_string . " " . 
$description_phrase_string; $page_string = preg_replace("/(\s)+/", " ", $page_string); return $page_string; diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index bc27150f3..1b18d3448 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -52,7 +52,7 @@ require_once BASE_DIR."/lib/url_parser.php"; */ class HtmlProcessor extends TextProcessor { - const MAX_DESCRIPTION_LEN = 3000; + const MAX_DESCRIPTION_LEN = 2000; /** diff --git a/lib/processors/image_processor.php b/lib/processors/image_processor.php index 009e6917b..fbe9ec4be 100755 --- a/lib/processors/image_processor.php +++ b/lib/processors/image_processor.php @@ -73,7 +73,7 @@ abstract class ImageProcessor implements CrawlConstants static function createThumb($image) { $thumb = imagecreatetruecolor(50, 50); - if( isset($image) && $image == false ) { + if( isset($image) && $image !== false ) { $size_x = imagesx($image); $size_y = imagesy($image); diff --git a/lib/url_parser.php b/lib/url_parser.php index be5623dfd..dcf039ac5 100755 --- a/lib/url_parser.php +++ b/lib/url_parser.php @@ -114,7 +114,7 @@ class UrlParser * @param string $url the url to parse * @return the host portion of the url if present; NULL otherwise */ - public static function getPath($url) + static function getPath($url) { $url_parts = @parse_url($url); if(!isset($url_parts['path'])) { @@ -134,7 +134,7 @@ class UrlParser * @param string $url the url to extract prefixes from * @return array the array of url prefixes */ - public static function getHostPaths($url) + static function getHostPaths($url) { $host_paths = array($url); @@ -162,6 +162,34 @@ class UrlParser } + /** + * Gets the subdomains of the host portion of a url. 
So + * + * http://a.b.c/d/f/ + * will return a.b.c, .a.b.c, b.c, .b.c, c, .c + * + * @param string $url the url to extract prefixes from + * @return array the array of url prefixes + */ + static function getHostSubdomains($url) + { + $subdomains = array(); + $url_parts = @parse_url($url); + if(strlen($url_parts['host']) <= 0) { return $subdomains; } + $host = $url_parts['host']; + $host_parts = explode(".", $host); + $num_parts = count($host_parts); + $domain = ""; + for($i = $num_parts - 1; $i >= 0 ; $i--) { + $domain = $host_parts[$i].$domain; + $subdomains[] = $domain; + $domain = ".$domain"; + $subdomains[] = $domain; + } + + return $subdomains; + } + /** * Given a url, makes a guess at the file type of the file it points to * @@ -312,7 +340,13 @@ class UrlParser $path2 = str_replace("//","/", $path); } while($path != $path2); - $path = str_replace("/./","/", $path); + $path = str_replace("/./","/", $path); + if($path == "." || substr($path, -2) == "/.") { + $path = "/"; + } + if($path == "") { + $path = "/"; + } $url = $host.$path; diff --git a/lib/utility.php b/lib/utility.php index 55294e4a4..376f3cc08 100755 --- a/lib/utility.php +++ b/lib/utility.php @@ -215,6 +215,22 @@ function scoreOrderCallback($word_doc_a, $word_doc_b) (float)$word_doc_b[CrawlConstants::SCORE]) ? -1 : 1; } +/** + * Callback function used to sort documents by doc_rank + * + * The function is used to sort documents being added to an IndexArchiveBundle + * + * @param string $word_doc_a doc id of first document to compare + * @param string $word_doc_b doc id of second document to compare + * @return int -1 if first doc bigger 1 otherwise + * @see IndexArchiveBundle::addPartitionWordData() + */ +function docRankOrderCallback($word_doc_a, $word_doc_b) +{ + return ((float)$word_doc_a[CrawlConstants::DOC_RANK] > + (float)$word_doc_b[CrawlConstants::DOC_RANK]) ? 
-1 : 1; +} + /** * Callback to check if $a is less than $b * diff --git a/lib/web_archive_bundle.php b/lib/web_archive_bundle.php index 4f56f6b77..7909e0d29 100755 --- a/lib/web_archive_bundle.php +++ b/lib/web_archive_bundle.php @@ -145,7 +145,7 @@ class WebArchiveBundle } //store/read archive description - $info = NULL; + if(file_exists($dir_name."/description.txt")) { $info = unserialize( file_get_contents($this->dir_name."/description.txt")); @@ -170,8 +170,6 @@ class WebArchiveBundle } } - $info = array(); - $info['DESCRIPTION'] = $this->description; $info['NUM_PARTITIONS'] = $this->num_partitions; $info['COUNT'] = $this->count; @@ -418,17 +416,36 @@ class WebArchiveBundle return $this->partition[$index]; } + /** + * Creates a new counter to be maintain in the description.txt + * file if the counter doesn't exist, leaves unchanged otherwise + * + * @param string $field field of info struct to add a counter for + */ + function initCountIfNotExists($field = "COUNT") + { + $info = + unserialize(file_get_contents($this->dir_name."/description.txt")); + if(!isset($info[$field])) { + $info[$field] = 0; + } + file_put_contents($this->dir_name."/description.txt", serialize($info)); + } + /** * Updates the description file with the current count for the number of - * items in the WebArchiveBundle + * items in the WebArchiveBundle. If the $field item is used counts of + * additional properties (visited urls say versus total urls) can be + * maintained. 
* * @param int $num number of items to add to current count + * @param string $field field of info struct to add to the count of */ - function addCount($num) + function addCount($num, $field = "COUNT") { $info = unserialize(file_get_contents($this->dir_name."/description.txt")); - $info['COUNT'] += $num; + $info[$field] += $num; file_put_contents($this->dir_name."/description.txt", serialize($info)); } diff --git a/locale/en-US/configure.ini b/locale/en-US/configure.ini index 265df66b2..4b38bc799 100755 --- a/locale/en-US/configure.ini +++ b/locale/en-US/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "Please Describe Your Robot" ; search_controller.php line: 119 search_controller_logout_successful = "Logout Successful!!" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "This cached version of %s was obtained by the Yioop crawler on %s." ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "Time started:" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "No start time found" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "Total Urls Seen:" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "Visited Urls Count:" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "Total Urls Extracted:" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "Most Recent Fetcher:" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "No Fetcher Queries Yet" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "Most Recent Urls" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "No Recent Urls" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "Previous Crawls" ; -; crawlstatus_view.php 
line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "Description:" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "Time started:" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "Total Urls Seen:" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "Visited/Extracted Urls" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "Actions:" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "Resume" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "Set as Index" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "Search Index" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "Delete" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "No Previous Crawls" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "Rel: %s " ; search_view.php line: 133 search_view_score = "Score %s" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "Cached" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "View as text" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "Similar" ; +; search_view.php line: 157 +search_view_inlink = "Inlinks" +; ; settings_view.php line: 76 settings_view_settings = "Settings" ; diff --git a/locale/en-US/statistics.txt b/locale/en-US/statistics.txt index 5a165df53..b6bef56f0 100755 --- a/locale/en-US/statistics.txt +++ b/locale/en-US/statistics.txt @@ -1 +1 @@ -d:100; \ No newline at end of file +d:99; \ No newline at end of file diff --git a/locale/fr-FR/configure.ini b/locale/fr-FR/configure.ini index 
251692d5c..6d7c9acab 100755 --- a/locale/fr-FR/configure.ini +++ b/locale/fr-FR/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; 
crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "Pertinence: %s" ; search_view.php line: 133 search_view_score = "Total: %s" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "En Cache" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "Version texte" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "Pages similaires" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "Préférences" ; diff --git a/locale/he/configure.ini b/locale/he/configure.ini index 07c05e0f1..555e6e092 100755 --- a/locale/he/configure.ini +++ b/locale/he/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php 
line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "" ; search_view.php line: 133 search_view_score = "" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "הגדרות" ; diff --git a/locale/in-ID/configure.ini b/locale/in-ID/configure.ini index 16a90626b..dd9744670 100755 --- a/locale/in-ID/configure.ini +++ b/locale/in-ID/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "Logout berhasil" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "" ; ; 
settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "" ; search_view.php line: 133 search_view_score = "" ; -; 
search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "" ; diff --git a/locale/ja/configure.ini b/locale/ja/configure.ini index 01eb7f43f..4da074c08 100755 --- a/locale/ja/configure.ini +++ b/locale/ja/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "ロボットの説明してください。" ; search_controller.php line: 119 search_controller_logout_successful = "ログアウト成功" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "%sのこのキャッシュされたバージョンは%sのウィオップから入手しました。" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "始まった時の時間" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "検索始まった時間は見つけない" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "全部URL" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "全部URL" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "最新フェッチャ" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "フェッチャキュエリはまだありません" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "最新URL" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "最近URLはありません" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "さっきの検索" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "説明" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "始まった時の時間" ; -; 
crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "全部URL" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "アクション" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "再会" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "指数の設定する。" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "検索指数" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "削除" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "さっきの検索はありません" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "関連:%s" ; search_view.php line: 133 search_view_score = "スコア %s" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "キャッシューしました。" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "テクストビュー" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "同じビュー" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "設定" ; diff --git a/locale/ja/statistics.txt b/locale/ja/statistics.txt index 5a165df53..eedae9b06 100755 --- a/locale/ja/statistics.txt +++ b/locale/ja/statistics.txt @@ -1 +1 @@ -d:100; \ No newline at end of file +d:98; \ No newline at end of file diff --git a/locale/ko/configure.ini b/locale/ko/configure.ini index e6d3569e9..81951938c 100755 --- a/locale/ko/configure.ini +++ b/locale/ko/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "로그 아웃 성공!!" 
; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "현재 캐시 버젼 %s 은 Yioop 크롤 %s 에 의하여 얻어 졌습니다. " ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "시작한 시간:" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "시작 시간이 존재하지 않습니다." ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "지금까지 본 총 합계 주소(URLs):" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "지금까지 본 총 합계 주소(URLs):" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "설명:" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "시작한 시간:" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "지금까지 본 총 합계 주소(URLs):" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php 
line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "관련성: %s " ; search_view.php line: 133 search_view_score = "점수 %s" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "캐시 됀것" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "일반 텍스트로써 보기" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "유사성" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "세팅" ; diff --git a/locale/ko/statistics.txt b/locale/ko/statistics.txt index 187cb44f0..b26155324 100755 --- a/locale/ko/statistics.txt +++ b/locale/ko/statistics.txt @@ -1 +1 @@ -d:32; \ No newline at end of file +d:31; \ No newline at end of file diff --git a/locale/rn-US/configure.ini b/locale/rn-US/configure.ini index 03f846c9d..63b95bb17 100755 --- a/locale/rn-US/configure.ini +++ b/locale/rn-US/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "Logout Successful!" 
; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; 
/Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "" ; search_view.php line: 133 search_view_score = "" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "" ; diff --git a/locale/th/configure.ini b/locale/th/configure.ini index e255988bf..123f3d571 100755 --- a/locale/th/configure.ini +++ b/locale/th/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 
crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "" ; search_view.php line: 133 search_view_score = "" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "" ; diff --git a/locale/vi-VN/configure.ini b/locale/vi-VN/configure.ini index e70ca3884..39dbad5a2 100755 --- a/locale/vi-VN/configure.ini +++ b/locale/vi-VN/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "Diễn tả rô bô của bạn" ; search_controller.php line: 119 search_controller_logout_successful = "Thoát thành công" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "Trang gốc này: %s đã tìm được bởi công cụ tìm kiẽm Yioop vào ngày %s." 
; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "Thời gian bắt đầu:" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "Không tìm thấy thời gian bắt đầu" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "Mô tả:" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "Thời gian bắt đầu:" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "Những hành động:" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "Bắt đầu trở lại" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "Cài làm mục lục" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "Tìm mục lục" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "Xoá" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; 
/Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "Thích hợp:" ; search_view.php line: 133 search_view_score = "Điểm: %s" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "Trang gốc" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "Trang Web Bắng Chữ" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "Tương Tự" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "Sự sắp đặt" ; diff --git a/locale/vi-VN/statistics.txt b/locale/vi-VN/statistics.txt index 77bbfe053..2c43a0adb 100755 --- a/locale/vi-VN/statistics.txt +++ b/locale/vi-VN/statistics.txt @@ -1 +1 @@ -d:74; \ No newline at end of file +d:73; \ No newline at end of file diff --git a/locale/vn-US/configure.ini b/locale/vn-US/configure.ini index 7908df79c..ad67134e6 100755 --- a/locale/vn-US/configure.ini +++ b/locale/vn-US/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 
crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "" ; search_view.php line: 133 search_view_score = "" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "" ; diff --git a/locale/zh-CN/configure.ini b/locale/zh-CN/configure.ini index b358f233b..7b9eb7f87 100755 --- a/locale/zh-CN/configure.ini +++ b/locale/zh-CN/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "" ; -; search_controller.php line: 366 +; 
search_controller.php line: 369 search_controller_cached_version = "" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ 
search_view_relevancy = "" ; search_view.php line: 133 search_view_score = "分數" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "設定" ; diff --git a/locale/zh-CN/statistics.txt b/locale/zh-CN/statistics.txt index b26155324..039ce78b2 100755 --- a/locale/zh-CN/statistics.txt +++ b/locale/zh-CN/statistics.txt @@ -1 +1 @@ -d:31; \ No newline at end of file +d:30; \ No newline at end of file diff --git a/models/crawl_model.php b/models/crawl_model.php index 1cf075d31..03d9d95a2 100755 --- a/models/crawl_model.php +++ b/models/crawl_model.php @@ -191,6 +191,9 @@ class CrawlModel extends Model implements CrawlConstants substr($pre_timestamp, strlen(self::index_data_base_name)); $info = IndexArchiveBundle::getArchiveInfo($dir); $crawl['DESCRIPTION'] = $info['DESCRIPTION']; + $crawl['VISITED_URLS_COUNT'] = + isset($info['VISITED_URLS_COUNT']) ? + $info['VISITED_URLS_COUNT'] : 0; $crawl['COUNT'] = $info['COUNT']; $crawl['NUM_PARTITIONS'] = $info['NUM_PARTITIONS']; $list[] = $crawl; diff --git a/models/phrase_model.php b/models/phrase_model.php index eb0d38795..5cf646d6d 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -46,6 +46,14 @@ require_once BASE_DIR."/lib/utility.php"; */ require_once BASE_DIR."/lib/index_archive_bundle.php"; +/** + * Load iterators to get docs out of index archive + */ +foreach(glob(BASE_DIR."/lib/index_bundle_iterators/*_iterator.php") + as $filename) { + require_once $filename; +} + /** * * This is class is used to handle @@ -89,44 +97,106 @@ class PhraseModel extends Model $format = true) { - $index_archive_name = self::index_data_base_name . 
$this->index_name; + $results = NULL; + $word_structs = array(); + /* + this is a quick and dirty parsing and will usually work, + exceptions would be | in quotes or if someone tried + to escape |. + */ + $disjunct_phrases = explode("|", $phrase); + foreach($disjunct_phrases as $disjunct) { + list($word_struct, $format_words) = + $this->parseWordStructConjunctiveQuery($disjunct); + if($word_struct != NULL) { + $word_structs[] = $word_struct; + } + } + + $results = $this->getSummariesByHash($word_structs, + $low, $results_per_page); + if(count($results) == 0) { + $results = NULL; + } + if($results == NULL) { + $results['TOTAL_ROWS'] = 0; + } + + if($format) { + if(count($format_words) == 0 ){ + $format_words = NULL; + } + } else { + $format_words = NULL; + } + + + $output = $this->formatPageResults($results, $format_words); + + return $output; + } + + + function parseWordStructConjunctiveQuery($phrase) + { + $phrase = " ".$phrase; + $phrase_string = $phrase; + $meta_words = array('link\:', 'site\:', + 'filetype\:', 'info\:', '\-', + 'index:', 'i:', 'weight:', 'w:'); + $index_name = $this->index_name; + $weight = 1; + $found_metas = array(); + $disallow_phrases = array(); + foreach($meta_words as $meta_word) { + $pattern = "/(\s)($meta_word(\S)+)/"; + preg_match_all($pattern, $phrase, $matches); + if(in_array($meta_word, array('link\:', 'site\:', + 'filetype\:', 'info\:') )) { + $found_metas = array_merge($found_metas, $matches[2]); + } else if($meta_word == '\-') { + if(count($matches[0]) > 0) { + $disallow_phrases = + array_merge($disallow_phrases, + array(substr($matches[2][0],2))); + } + } else if ($meta_word == "i:" || $meta_word == "index:") { + if(isset($matches[2][0])) { + $index_name = substr($matches[2][0],strlen($meta_word)); + } + } else if ($meta_word == "w:" || $meta_word == "weight:") { + if(isset($matches[2][0])) { + $weight = substr($matches[2][0],strlen($meta_word)); + } + } + $phrase_string = preg_replace($pattern,"", $phrase_string); + } + + 
$index_archive_name = self::index_data_base_name . $index_name; $index_archive = new IndexArchiveBundle( CRAWL_DIR.'/cache/'.$index_archive_name); - $results = NULL; - - $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $phrase); + $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $phrase_string); $phrase_string = preg_replace("/(\s)+/", " ", $phrase_string); + /* we search using the stemmed words, but we format snippets in the results by bolding either */ $query_words = explode(" ", $phrase_string); //not stemmed - $words = + $base_words = array_keys(PhraseParser::extractPhrasesAndCount($phrase_string)); //stemmed + + $words = array_merge($base_words, $found_metas); if(isset($words) && count($words) == 1) { $phrase_string = $words[0]; - } - $phrase_hash = crawlHash($phrase_string); - - $phrase_info = $index_archive->getPhraseIndexInfo($phrase_hash); - if(isset($phrase_info[IndexingConstants::PARTIAL_COUNT]) && - $phrase_info[IndexingConstants::PARTIAL_COUNT] < - $low + $results_per_page) { - $phrase_info = NULL; - } - - if($phrase_info != NULL) { - - $results = $index_archive->getSummariesByHash( - $phrase_hash, $low, $results_per_page, NULL, NULL, $phrase_info); - - if(count($results) == 0) { - $results = NULL; - } - + $phrase_hash = crawlHash($phrase_string); + $word_struct = array("KEYS" => array($phrase_hash), + "RESTRICT_PHRASES" => NULL, "DISALLOW_PHRASES" => NULL, + "WEIGHT" => $weight, "INDEX_ARCHIVE" => $index_archive + ); } else { /* handle strings in quotes @@ -138,13 +208,6 @@ class PhraseModel extends Model preg_match_all('/\"((?:[^\"\\\]|\\\\.)*)\"/', $phrase,$quoteds); if(isset($quoteds[1])) { $quoteds = $quoteds[1]; - foreach($quoteds as $quote_phrase) { - $hash_quote = crawlHash($quote_phrase); - if($index_archive->getPhraseIndexInfo($hash_quote) != NULL){ - $hash_quoteds[] = $hash_quote; - } - } - } //get a raw list of words and their hashes @@ -154,42 +217,42 @@ class PhraseModel extends Model $tmp = crawlHash($word); $hashes[] = $tmp; } - 
$hashes = array_merge($hashes, $hash_quoteds); + $restrict_phrases = array_merge($query_words, $quoteds); - - + $hashes = array_unique($hashes); $restrict_phrases = array_unique($restrict_phrases); - - $words_array = $index_archive->getSelectiveWords($hashes, 1); - $word_keys = array_keys($words_array); - $word_key = $word_keys[0]; - $count = $words_array[$word_key]; - if($count > 0 ) { - $results = $index_archive->getSummariesByHash( - $word_key, $low, $results_per_page, - $restrict_phrases, $phrase_hash); + $restrict_phrases = array_filter($restrict_phrases); + $words_array = $index_archive->getSelectiveWords($hashes, 10); + + if(is_array($words_array)) { + reset($words_array); + $word_key = key($words_array); + $word_count = $words_array[$word_key]; + foreach($words_array as $key => $count) { + if($count > 3 * $word_count) { + unset($words_array[$key]); + } + } + $word_keys = array_keys($words_array); + $word_struct = array("KEYS" => $word_keys, + "RESTRICT_PHRASES" => $restrict_phrases, + "DISALLOW_PHRASES" => $disallow_phrases, + "WEIGHT" => $weight, + "INDEX_ARCHIVE" => $index_archive + ); + if($word_count <= 0 ) { + $word_struct = NULL; + } + } else { + $word_struct = NULL; } } + $format_words = array_merge($query_words, $base_words); - if($results == NULL) { - $results['TOTAL_ROWS'] = 0; - } - - if($format) { - $formatted_words = array_merge($query_words, $words); - } else { - $formatted_words = NULL; - } - - - $output = $this->formatPageResults($results, $formatted_words); - - return $output; - + return array($word_struct, $format_words); } - /** * Given a page summary extract the words from it and try to find documents * which match the most relevant words. The algorithm for "relevant" is @@ -234,6 +297,93 @@ class PhraseModel extends Model } + /** + * Gets doc summaries of documents containing given words and meeting the + * additional provided criteria + * @param array $word_structs an array of word_structs. 
Here a word_struct + * is an associative array with at least the following fields + * KEYS -- an array of word keys + * RESTRICT_PHRASES -- an array of phrases the document must contain + * DISALLOW_PHRASES -- an array of words the document must not contain + * WEIGHT -- a weight to multiple scores returned from this iterator by + * INDEX_ARCHIVE -- an index_archive object to get results from + * @param int $limit number of first document in order to return + * @param int $num number of documents to return summaries of + * @param object $index_archive index archive to use to get summaries from + * @return array document summaries + */ + function getSummariesByHash($word_structs, $limit, $num) + { + + $iterators = array(); + foreach($word_structs as $word_struct) { + if(!is_array($word_struct)) { continue;} + $word_keys = $word_struct["KEYS"]; + $restrict_phrases = $word_struct["RESTRICT_PHRASES"]; + $disallow_phrases = $word_struct["DISALLOW_PHRASES"]; + $index_archive = $word_struct["INDEX_ARCHIVE"]; + $weight = $word_struct["WEIGHT"]; + $num_word_keys = count($word_keys); + if($num_word_keys < 1) {continue;} + + for($i = 0; $i < $num_word_keys; $i++) { + $word_iterators[$i] = + new WordIterator($word_keys[$i], $index_archive, 0); + } + if($num_word_keys == 1) { + $base_iterator = $word_iterators[0]; + } else { + $base_iterator = new IntersectIterator($word_iterators, 0); + } + if($restrict_phrases == NULL && $disallow_phrases == NULL && + $weight == 1) { + $iterators[] = $base_iterator; + } else { + $iterators[] = new PhraseFilterIterator($base_iterator, + $restrict_phrases, $disallow_phrases, $weight, 0); + } + + } + $num_iterators = count($iterators); + if( $num_iterators < 1) { + return NULL; + } else if($num_iterators == 1) { + $union_iterator = $iterators[0]; + } else { + $union_iterator = new UnionIterator($iterators, 0); + } + + $to_retrieve = $limit + max(2*$num, 200); + $group_iterator = new GroupIterator($union_iterator, 0); + $num_retrieved = 0; + 
$pages = array(); + while(is_array($next_docs = $group_iterator->nextDocsWithWord()) && + $num_retrieved < $to_retrieve) { + foreach($next_docs as $doc_key => $doc_info) { + $summary = & $doc_info[CrawlConstants::SUMMARY]; + unset($doc_info[CrawlConstants::SUMMARY]); + $pages[] = array_merge($doc_info, $summary); + $num_retrieved++; + if($num_retrieved >= $to_retrieve) { + + break 2; + } + } + } + uasort($pages, "scoreOrderCallback"); + $pages = array_slice($pages, $limit, $num); + if($num_retrieved < $to_retrieve && $limit<=$group_iterator->num_docs) { + $results['TOTAL_ROWS'] = $num_retrieved; + } else { + $results['TOTAL_ROWS'] = max($group_iterator->num_docs, + $num_retrieved); + /*num_docs is only approximate, so if gives contradictory info + use $num_retrieved */ + } + $results['PAGES'] = $pages; + return $results; + } + } ?> diff --git a/views/crawlstatus_view.php b/views/crawlstatus_view.php index b7a92e888..452c5e8f7 100755 --- a/views/crawlstatus_view.php +++ b/views/crawlstatus_view.php @@ -76,7 +76,10 @@ class CrawlstatusView extends View <?php if(isset($data['CRAWL_TIME'])) { e(date("r",$data['CRAWL_TIME'])); } else {e(tl('crawlstatus_view_no_crawl_time'));} ?></p> - + <p><b><?php e(tl('crawlstatus_view_visited_urls')); ?></b> <?php + if(isset($data['VISITED_URLS_COUNT'])) { + e($data['VISITED_URLS_COUNT']); } else {e("0");} + ?></p> <p><b><?php e(tl('crawlstatus_view_total_urls')); ?></b> <?php if(isset($data['COUNT'])) { e($data['COUNT']); } else {e("0");} ?></p> @@ -109,14 +112,16 @@ class CrawlstatusView extends View <table class="crawlstable"> <tr><th><?php e(tl('crawlstatus_view_description'));?></th><th><?php e(tl('crawlstatus_view_time_started')); ?></th> - <th><?php e(tl('crawlstatus_view_total_urls'));?></th> + <th><?php e(tl('crawlstatus_view_url_counts'));?></th> <th colspan="3"><?php e(tl('crawlstatus_view_actions'));?></th></tr> <?php foreach($data['RECENT_CRAWLS'] as $crawl) { ?> <tr><td><b><?php e($crawl['DESCRIPTION']); ?></b></td><td> 
<?php e(date("r", $crawl['CRAWL_TIME'])); ?></td> - <td> <?php e( $crawl['COUNT']); ?></td> + <td> <?php e( (isset($crawl["VISITED_URLS_COUNT"]) ? + $crawl['VISITED_URLS_COUNT'] : 0) ."/". + $crawl['COUNT']); ?></td> <td><a href="<?php e($base_url); ?>resume×tamp=<?php e($crawl['CRAWL_TIME']); ?>"><?php e(tl('crawlstatus_view_resume'));?></a></td> diff --git a/views/search_view.php b/views/search_view.php index 885eef536..9be7cc805 100755 --- a/views/search_view.php +++ b/views/search_view.php @@ -114,7 +114,10 @@ class SearchView extends View implements CrawlConstants foreach($data['PAGES'] as $page) {?> <div class='result'> <h2> - <a href="<?php e($page[self::URL]); ?>" ><?php + <a href="<?php if($page[self::TYPE] != "link") { + e($page[self::URL]); + } else + e(strip_tags($page[self::TITLE])); ?>" ><?php if(isset($page[self::THUMB]) && $page[self::THUMB] != 'NULL') { ?><img src="<?php e($page[self::THUMB]); ?>" alt="<?php e($page[self::TITLE]); ?>" /> <?php @@ -125,31 +128,44 @@ class SearchView extends View implements CrawlConstants ?></a></h2> <p><?php echo $page[self::DESCRIPTION]; ?></p> - <p class="echolink" ><?php e($page[self::URL]." "); + <p class="echolink" ><?php + e(substr($page[self::URL],0, 200)." "); e(tl('search_view_rank', - number_format($page[self::DOC_RANK], 2))); + number_format($page[self::DOC_RANK], 2))); + $page["WEIGHT"] = (isset($page["WEIGHT"])) ? 
+ $page["WEIGHT"] : 1; e(tl('search_view_relevancy', - number_format(1.25*floatval($page[self::SCORE]) - - floatval($page[self::DOC_RANK]), 2) )); - e(tl('search_view_score', 1.25* $page[self::SCORE]));?> - <a href="?c=search&a=cache&q=<?php - e($data['QUERY']); ?>&arg=<?php - e(urlencode($page[self::URL])); - ?>&so=<?php e($page[self::SUMMARY_OFFSET]); - ?>&its=<?php e($data['its']); ?>" > - <?php - if($page[self::TYPE] == "text/html" || - stristr($page[self::TYPE], "image")) { - e(tl('search_view_cache')); + number_format((1.25*floatval($page[self::SCORE]) + - floatval($page[self::DOC_RANK])) + / $page["WEIGHT"] , 2) )); + e(tl('search_view_score', 1.25* $page[self::SCORE])); + if($page[self::TYPE] != "link") { + ?> + <a href="?c=search&a=cache&q=<?php + e($data['QUERY']); ?>&arg=<?php + e(urlencode($page[self::URL])); + ?>&so=<?php e($page[self::SUMMARY_OFFSET]); + ?>&its=<?php e($data['its']); ?>" > + <?php + if($page[self::TYPE] == "text/html" || + stristr($page[self::TYPE], "image")) { + e(tl('search_view_cache')); - } else { - e(tl('search_view_as_text')); - } - ?></a>. <a href="?c=search&a=related&arg=<?php - e(urlencode($page[self::URL])); ?>&so=<?php - e($page[self::SUMMARY_OFFSET]); - ?>&its=<?php e($data['its']); ?>" ><?php - e(tl('search_view_similar')); ?></a>.</p> + } else { + e(tl('search_view_as_text')); + } + ?></a>. <a href="?c=search&a=related&arg=<?php + e(urlencode($page[self::URL])); ?>&so=<?php + e($page[self::SUMMARY_OFFSET]); + ?>&its=<?php e($data['its']); ?>" ><?php + e(tl('search_view_similar')); + ?></a>. <a href="?c=search&q=<?php + e("link:".urlencode($page[self::URL])); ?>& + its=<?php e($data['its']); ?>" ><?php + e(tl('search_view_inlink')); + ?></a>.</p> + <?php + } ?> </div> <?php