diff --git a/bin/arc_tool.php b/bin/arc_tool.php index b78211e5f..e1761d2be 100644 --- a/bin/arc_tool.php +++ b/bin/arc_tool.php @@ -268,15 +268,15 @@ class ArcTool implements CrawlConstants $generation = 0; while($seen < $total && $generation < $num_generations) { $partition = $archive->getPartition($generation, false); - if($archive->count < $start && $seen < $start) { + if($partition->count < $start && $seen < $start) { $generation++; - $seen += $this->count; + $seen += $partition->count; continue; } $seen_generation = 0; - while($seen < $total && $seen_generation < $archive->count) { + while($seen < $total && $seen_generation < $partition->count) { $num_to_get = min($total - $seen, - $archive->count - $seen_generation, + $partition->count - $seen_generation, self::MAX_BUFFER_DOCS); $objects = $partition->nextObjects($num_to_get); $seen += $num_to_get; diff --git a/bin/fetcher.php b/bin/fetcher.php index fb9d40254..a088435c0 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -60,13 +60,17 @@ require_once BASE_DIR."/models/datasources/".DBMS."_manager.php"; */ require_once BASE_DIR."/lib/web_archive_bundle.php"; +/** get available archive iterators */ +foreach(glob(BASE_DIR."/lib/archive_bundle_iterators/*_bundle_iterator.php") + as $filename) { + require_once $filename; +} + /** get processors for different file types */ foreach(glob(BASE_DIR."/lib/processors/*_processor.php") as $filename) { require_once $filename; } -/** To support English language stemming of words (jumps, jumping --> jump)*/ -require_once BASE_DIR."/lib/porter_stemmer.php"; /** Used to manipulate urls*/ require_once BASE_DIR."/lib/url_parser.php"; /** Used to extract summaries from web pages*/ @@ -208,6 +212,21 @@ class Fetcher implements CrawlConstants */ var $crawl_order; + /** + * Indicates the kind of crawl being performed: self::WEB_CRAWL indicates + * a new crawl of the web; self::ARCHIVE_CRAWL indicates a crawl of an + * existing web archive + * @var string + */ + var $crawl_type; 
+ + /** + * If the crawl_type is self::ARCHIVE_CRAWL, then crawl_index is the + * timestamp of the existing archive to crawl + * @var string + */ + var $crawl_index; + /** * Sets up the field variables for that crawling can begin * @@ -229,6 +248,9 @@ class Fetcher implements CrawlConstants $this->crawl_time = NULL; $this->schedule_time = NULL; + $this->crawl_type = self::WEB_CRAWL; + $this->crawl_index = NULL; + $this->to_crawl = array(); $this->to_crawl_again = array(); $this->found_sites = array(); @@ -275,8 +297,6 @@ class Fetcher implements CrawlConstants $this->checkCrawlTime(); while ($info[self::STATUS] != self::STOP_STATE) { - - $fetcher_message_file = CRAWL_DIR."/schedules/fetcher_messages.txt"; if(file_exists($fetcher_message_file)) { $info = unserialize(file_get_contents($fetcher_message_file)); @@ -286,6 +306,7 @@ class Fetcher implements CrawlConstants } $info = $this->checkScheduler(); + if(!isset($info[self::STATUS])) { $info[self::STATUS] = self::CONTINUE_STATE; } @@ -328,38 +349,19 @@ class Fetcher implements CrawlConstants $this->deleteOldCrawls($info[self::SAVED_CRAWL_TIMES]); } - $start_time = microtime(); - $can_schedule_again = false; - if(count($this->to_crawl) > 0) { - $can_schedule_again = true; - } - $sites = $this->getFetchSites(); - if(!$sites) { - crawlLog("No seeds to fetch..."); - sleep(max(0, ceil( - MINIMUM_FETCH_LOOP_TIME - changeInMicrotime($start_time)))); - continue; - } - - $site_pages = FetchUrl::getPages($sites, true); + switch($this->crawl_type) + { + case self::WEB_CRAWL: + $downloaded_pages = $this->downloadPagesWebCrawl(); + break; - list($downloaded_pages, $schedule_again_pages) = - $this->reschedulePages($site_pages); - - if($can_schedule_again == true) { - //only schedule to crawl again on fail sites without crawl-delay - foreach($schedule_again_pages as $schedule_again_page) { - if($schedule_again_page[self::CRAWL_DELAY] == 0) { - $this->to_crawl_again[] = - array($schedule_again_page[self::URL], - 
$schedule_again_page[self::WEIGHT], - $schedule_again_page[self::CRAWL_DELAY] - ); - } - } + case self::ARCHIVE_CRAWL: + $downloaded_pages = $this->downloadPagesArchiveCrawl(); + break; } $start_time = microtime(); + $summarized_site_pages = $this->processFetchPages($downloaded_pages); @@ -373,7 +375,93 @@ class Fetcher implements CrawlConstants crawlLog("Fetcher shutting down!!"); } - + + /** + * Get a list of urls from the current fetch batch provided by the queue + * server. Then downloads these pages. Finally, reschedules, if + * possible, pages that did not successfully get downloaded. + * + * @return array an associative array of web pages and meta data + * fetched from the internet + */ + function downloadPagesWebCrawl() + { + $start_time = microtime(); + $can_schedule_again = false; + if(count($this->to_crawl) > 0) { + $can_schedule_again = true; + } + $sites = $this->getFetchSites(); + + if(!$sites) { + crawlLog("No seeds to fetch..."); + sleep(max(0, ceil( + MINIMUM_FETCH_LOOP_TIME - changeInMicrotime($start_time)))); + return array(); + } + + $site_pages = FetchUrl::getPages($sites, true); + + list($downloaded_pages, $schedule_again_pages) = + $this->reschedulePages($site_pages); + + if($can_schedule_again == true) { + //only schedule to crawl again on fail sites without crawl-delay + foreach($schedule_again_pages as $schedule_again_page) { + if($schedule_again_page[self::CRAWL_DELAY] == 0) { + $this->to_crawl_again[] = + array($schedule_again_page[self::URL], + $schedule_again_page[self::WEIGHT], + $schedule_again_page[self::CRAWL_DELAY] + ); + } + } + } + + return $downloaded_pages; + } + + /** + * Extracts NUM_MULTI_CURL_PAGES from the current Archive Bundle that is + * being recrawled. + * + * @return array an associative array of web pages and meta data from + * the archive bundle being iterated over + */ + function downloadPagesArchiveCrawl() + { + $base_name = CRAWL_DIR.'/cache/'.self::archive_base_name. 
+ $this->crawl_index; + $pages = array(); + if(!isset($this->archive_iterator->iterate_timestamp) || + $this->archive_iterator->iterate_timestamp != $this->crawl_index || + $this->archive_iterator->result_timestamp != $this->crawl_time) { + if(!file_exists($base_name)){ + crawlLog("Recrawl archive with timestamp" . + " {$this->crawl_index} does not exist!"); + return $pages; + } else { + if(file_exists("$base_name/arc_type.txt")) { + $arctype = trim(file_get_contents( + "$base_name/arc_type.txt")); + } else { + $arctype = "WebArchiveBundle"; + } + $iterator_name = $arctype."Iterator"; + $this->archive_iterator = + new $iterator_name($this->crawl_index, $this->crawl_time); + if($this->archive_iterator == NULL) { + crawlLog("Error creating archive iterator!!"); + return $pages; + } + } + } + if(!$this->archive_iterator->end_of_iterator) { + $pages = $this->archive_iterator->nextPages(NUM_MULTI_CURL_PAGES); + } + return $pages; + } + /** * Deletes any crawl web archive bundles not in the provided array of crawls * @@ -437,8 +525,9 @@ class Fetcher implements CrawlConstants function checkScheduler() { $info = array(); - - if(count($this->to_crawl) > 0 || count($this->to_crawl_again) > 0) { + if((count($this->to_crawl) > 0 || count($this->to_crawl_again) > 0) && + (!isset($this->archive_iterator->end_of_iterator) || + !$this->archive_iterator->end_of_iterator)) { $info[self::STATUS] = self::CONTINUE_STATE; return; } @@ -456,6 +545,12 @@ class Fetcher implements CrawlConstants $tok = strtok($info_string, "\n"); $info = unserialize(base64_decode($tok)); + if(isset($info[self::CRAWL_TYPE])) { + $this->crawl_type = $info[self::CRAWL_TYPE]; + } + if(isset($info[self::CRAWL_INDEX])) { + $this->crawl_index = $info[self::CRAWL_INDEX]; + } if(isset($info[self::CRAWL_ORDER])) { $this->crawl_order = $info[self::CRAWL_ORDER]; } @@ -618,7 +713,7 @@ class Fetcher implements CrawlConstants $num_items = $this->web_archive->count; $i = 0; - + foreach($site_pages as $site) { 
$response_code = $site[self::HTTP_CODE]; @@ -647,6 +742,13 @@ class Fetcher implements CrawlConstants if(isset($PAGE_PROCESSORS[$type])) { $page_processor = $PAGE_PROCESSORS[$type]; + if($page_processor == "TextProcessor" || + get_parent_class($page_processor) == "TextProcessor") { + $text_data =true; + } else { + $text_data =false; + } + } else { continue; } @@ -656,6 +758,12 @@ class Fetcher implements CrawlConstants $doc_info = $processor->process($site[self::PAGE], $site[self::URL]); + $stored_fields = array(self::URL, self::HEADER, self::PAGE); + $summary_fields = array(self::IP_ADDRESSES, self::WEIGHT, + self::TIMESTAMP, self::TYPE, self::ENCODING, self::HTTP_CODE, + self::HASH, self::SERVER, self::SERVER_VERSION, + self::OPERATING_SYSTEM, self::MODIFIED); + if($doc_info) { $site[self::DOC_INFO] = $doc_info; @@ -666,37 +774,31 @@ class Fetcher implements CrawlConstants } - if($site[self::TYPE] != "text/html" ) { + if($text_data) { if(isset($doc_info[self::PAGE])) { $site[self::PAGE] = $doc_info[self::PAGE]; } else { $site[self::PAGE] = NULL; } - + } + if(!isset($site[self::ENCODING])) { + $site[self::ENCODING] = "UTF-8"; } - - $stored_site_pages[$i][self::URL] = $site[self::URL]; - $stored_site_pages[$i][self::IP_ADDRESSES] = - $site[self::IP_ADDRESSES]; - $stored_site_pages[$i][self::TIMESTAMP] = - $site[self::TIMESTAMP]; - $stored_site_pages[$i][self::TYPE] = $site[self::TYPE]; - if(isset($site[self::ENCODING])) { - $encoding = $site[self::ENCODING]; - } else { - $encoding = "UTF-8"; + foreach($summary_fields as $field) { + if(isset($site[$field])) { + $stored_site_pages[$i][$field] = $site[$field]; + $summarized_site_pages[$i][$field] = $site[$field]; + } + } + foreach($stored_fields as $field) { + if(isset($site[$field])) { + $stored_site_pages[$i][$field] = $site[$field]; + } } - $stored_site_pages[$i][self::ENCODING] = $encoding; - $stored_site_pages[$i][self::HTTP_CODE] = - $site[self::HTTP_CODE]; - $stored_site_pages[$i][self::HASH] = 
$site[self::HASH]; - $stored_site_pages[$i][self::PAGE] = $site[self::PAGE]; $summarized_site_pages[$i][self::URL] = strip_tags($site[self::URL]); - $summarized_site_pages[$i][self::IP_ADDRESSES] = - $site[self::IP_ADDRESSES]; $summarized_site_pages[$i][self::TITLE] = strip_tags( $site[self::DOC_INFO][self::TITLE]); // stripping html to be on the safe side @@ -705,15 +807,10 @@ class Fetcher implements CrawlConstants if(isset($site[self::DOC_INFO][self::JUST_METAS])) { $summarized_site_pages[$i][self::JUST_METAS] = true; } - $summarized_site_pages[$i][self::TIMESTAMP] = - $site[self::TIMESTAMP]; - $summarized_site_pages[$i][self::ENCODING] = $encoding; - $summarized_site_pages[$i][self::HASH] = $site[self::HASH]; - $summarized_site_pages[$i][self::TYPE] = $site[self::TYPE]; - $summarized_site_pages[$i][self::HTTP_CODE] = - $site[self::HTTP_CODE]; - $summarized_site_pages[$i][self::WEIGHT] = $site[self::WEIGHT]; - + if(isset($site[self::DOC_INFO][self::LANG])) { + $summarized_site_pages[$i][self::LANG] = + $site[self::DOC_INFO][self::LANG]; + } if(isset($site[self::DOC_INFO][self::LINKS])) { $summarized_site_pages[$i][self::LINKS] = $site[self::DOC_INFO][self::LINKS]; @@ -847,7 +944,8 @@ class Fetcher implements CrawlConstants $this->found_sites[self::ROBOT_TXT][$host][ self::CRAWL_DELAY] = $site[self::CRAWL_DELAY]; } - if(isset($site[self::LINKS])) { + if(isset($site[self::LINKS]) + && $this->crawl_type == self::WEB_CRAWL) { $num_links = count($site[self::LINKS]); //robots pages might have sitemaps links on them $this->addToCrawlSites($site[self::LINKS], @@ -855,7 +953,8 @@ class Fetcher implements CrawlConstants } } else { $this->found_sites[self::SEEN_URLS][] = $site; - if(isset($site[self::LINKS])) { + if(isset($site[self::LINKS]) + && $this->crawl_type == self::WEB_CRAWL) { if(!isset($this->found_sites[self::TO_CRAWL])) { $this->found_sites[self::TO_CRAWL] = array(); } @@ -878,7 +977,9 @@ class Fetcher implements CrawlConstants if((count($this->to_crawl) <= 0 
&& count($this->to_crawl_again) <= 0) || ( isset($this->found_sites[self::SEEN_URLS]) && count($this->found_sites[self::SEEN_URLS]) > - SEEN_URLS_BEFORE_UPDATE_SCHEDULER)) { + SEEN_URLS_BEFORE_UPDATE_SCHEDULER) || + ($this->crawl_type == self::ARCHIVE_CRAWL && + $this->archive_iterator->end_of_iterator)) { $this->updateScheduler(); } @@ -1095,11 +1196,18 @@ class Fetcher implements CrawlConstants */ if(!isset($site[self::JUST_METAS])) { $phrase_string = - mb_ereg_replace("[[:punct:]]", " ", $site[self::TITLE] . - " ". $site[self::DESCRIPTION]); + mb_ereg_replace(PUNCT, " ", $site[self::TITLE] . + " ". $site[self::DESCRIPTION]); + if(isset($site[self::LANG])) { + $lang = $site[self::LANG]; + } else { + $lang = NULL; + } $word_counts = - PhraseParser::extractPhrasesAndCount($phrase_string); + PhraseParser::extractPhrasesAndCount($phrase_string, + MAX_PHRASE_LEN, $lang); } + $meta_ids = array(); /* @@ -1124,7 +1232,35 @@ class Fetcher implements CrawlConstants if(strlen($url_type) > 0) { $meta_ids[] = 'filetype:'.$url_type; } - + if(isset($site[self::SERVER])) { + $meta_ids[] = 'server:'.strtolower($site[self::SERVER]); + } + if(isset($site[self::SERVER_VERSION])) { + $meta_ids[] = 'version:'. 
+ $site[self::SERVER_VERSION]; + } + if(isset($site[self::OPERATING_SYSTEM])) { + $meta_ids[] = 'os:'.strtolower($site[self::OPERATING_SYSTEM]); + } + if(isset($site[self::MODIFIED])) { + $modified = $site[self::MODIFIED]; + $meta_ids[] = 'modified:'.date('Y', $modified); + $meta_ids[] = 'modified:'.date('Y-m', $modified); + $meta_ids[] = 'modified:'.date('Y-m-d', $modified); + } + if(isset($site[self::TIMESTAMP])) { + $date = $site[self::TIMESTAMP]; + $meta_ids[] = 'date:'.date('Y', $date); + $meta_ids[] = 'date:'.date('Y-m', $date); + $meta_ids[] = 'date:'.date('Y-m-d', $date); + } + if(isset($site[self::LANG])) { + $lang_parts = explode("-", $site[self::LANG]); + $meta_ids[] = 'lang:'.$lang_parts[0]; + if(isset($lang_parts[1])){ + $meta_ids[] = 'lang:'.$site[self::LANG]; + } + } // handles user added meta words if(isset($this->meta_words)) { $matches = array(); @@ -1151,7 +1287,8 @@ class Fetcher implements CrawlConstants //store inlinks so they can be searched by $num_links = count($site[self::LINKS]); if($num_links > 0) { - $link_weight = $site[self::WEIGHT]/$num_links; + $weight = (isset($site[self::WEIGHT])) ? 
$site[self::WEIGHT] :1; + $link_weight = $weight/$num_links; } else { $link_weight = 0; } diff --git a/bin/queue_server.php b/bin/queue_server.php index d9aaf65cf..1bcb5ca48 100755 --- a/bin/queue_server.php +++ b/bin/queue_server.php @@ -116,6 +116,20 @@ class QueueServer implements CrawlConstants * @var string */ var $crawl_order; + /** + * Indicates the kind of crawl being performed: self::WEB_CRAWL indicates + * a new crawl of the web; self::ARCHIVE_CRAWL indicates a crawl of an + * existing web archive + * @var string + */ + var $crawl_type; + + /** + * If the crawl_type is self::ARCHIVE_CRAWL, then crawl_index is the + * timestamp of the existing archive to crawl + * @var string + */ + var $crawl_index; /** * Says whether the $allowed_sites array is being used or not * @var bool @@ -244,7 +258,6 @@ class QueueServer implements CrawlConstants //check for orphaned queue bundles $this->deleteOrphanedBundles(); - $count = $this->web_queue->to_crawl_queue->count; $this->processIndexData(); if(time() - $this->last_index_save_time > FORCE_SAVE_TIME){ @@ -254,28 +267,40 @@ class QueueServer implements CrawlConstants crawlLog("... 
Save time".(changeInMicrotime($start_time))); } - $this->processRobotUrls(); + switch($this->crawl_type) + { + case self::WEB_CRAWL: + $this->processRobotUrls(); - if($count < NUM_URLS_QUEUE_RAM - - SEEN_URLS_BEFORE_UPDATE_SCHEDULER * MAX_LINKS_PER_PAGE) { - $info = $this->processQueueUrls(); - } + $count = $this->web_queue->to_crawl_queue->count; - if($count > 0) { - $top = $this->web_queue->peekQueue(); - if($top[1] < MIN_QUEUE_WEIGHT) { - crawlLog("Normalizing Weights!!\n"); - $this->web_queue->normalize(); - /* this will undercount the weights of URLS - from fetcher data that have not completed - */ - } - - if(!file_exists(CRAWL_DIR."/schedules/schedule.txt")) { - $this->produceFetchBatch(); - } - } + if($count < NUM_URLS_QUEUE_RAM - + SEEN_URLS_BEFORE_UPDATE_SCHEDULER * MAX_LINKS_PER_PAGE) { + $info = $this->processQueueUrls(); + } + + if($count > 0) { + $top = $this->web_queue->peekQueue(); + if($top[1] < MIN_QUEUE_WEIGHT) { + crawlLog("Normalizing Weights!!\n"); + $this->web_queue->normalize(); + /* this will undercount the weights of URLS + from fetcher data that have not completed + */ + } + if(!file_exists(CRAWL_DIR."/schedules/schedule.txt")) { + $this->produceFetchBatch(); + } + } + break; + case self::ARCHIVE_CRAWL: + $this->processRecrawlRobotUrls(); + if(!file_exists(CRAWL_DIR."/schedules/schedule.txt")) { + $this->writeArchiveCrawlInfo(); + } + break; + } crawlLog("Taking five second sleep..."); sleep(5); } @@ -283,6 +308,77 @@ class QueueServer implements CrawlConstants crawlLog("Queue Server shutting down!!"); } + /** + * + */ + function writeArchiveCrawlInfo() + { + $schedule_time = time(); + $first_line = $this->calculateScheduleMetaInfo($schedule_time); + $fh = fopen(CRAWL_DIR."/schedules/schedule.txt", "wb"); + fwrite($fh, $first_line); + fclose($fh); + + $schedule_dir = + CRAWL_DIR."/schedules/". 
+ self::schedule_data_base_name.$this->crawl_time; + $this->processDataFile($schedule_dir, "processRecrawlDataArchive"); + + } + + function processRecrawlRobotUrls() + { + crawlLog("Checking for robots.txt files to process..."); + $robot_dir = + CRAWL_DIR."/schedules/". + self::robot_data_base_name.$this->crawl_time; + + $this->processDataFile($robot_dir, "processRecrawlRobotArchive"); + crawlLog("done. "); + } + + function processRecrawlRobotArchive($file) + { + crawlLog("Deleting unneeded robot schedule files"); + + unlink($file); + } + + /** + * + */ + function &getDataArchiveFileData($file) + { + crawlLog("Processing File: $file"); + + $fh = fopen($file, "rb"); + $machine_string = fgets($fh); + $len = strlen($machine_string); + if($len > 0) { + $machine_info = unserialize(base64_decode($machine_string)); + } + $sites = unserialize(gzuncompress(base64_decode( + urldecode(fread($fh, filesize($file) - $len)) + ))); + fclose($fh); + + if(isset($machine_info[self::MACHINE])) { + $this->most_recent_fetcher = & $machine_info[self::MACHINE]; + unset($machine_info); + } + return $sites; + } + /** + * + */ + function processRecrawlDataArchive($file) + { + $sites = & $this->getDataArchiveFileData($file); + unlink($file); + $this->writeCrawlStatus($sites); + } + + /** * Handles messages passed via files to the QueueServer. * @@ -307,6 +403,12 @@ class QueueServer implements CrawlConstants $this->startCrawl($info); crawlLog( "Starting new crawl. Timestamp:".$this->crawl_time); + if($this->crawl_type == self::WEB_CRAWL) { + crawlLog("Performing a web crawl!"); + } else { + crawlLog("Performing an archive crawl of ". 
+ "archive with timestamp ".$this->crawl_index); + } break; case "STOP_CRAWL": @@ -367,10 +469,10 @@ class QueueServer implements CrawlConstants */ function indexSave() { + $this->last_index_save_time = time(); if(isset($this->index_archive) && $this->index_dirty) { $this->index_archive->forceSave(); $this->index_dirty = false; - $this->last_index_save_time = time(); // chmod so apache can also write to these directories $this->db->setWorldPermissionsRecursive( CRAWL_DIR.'/cache/'. @@ -392,6 +494,8 @@ class QueueServer implements CrawlConstants $read_from_info = array( "crawl_order" => self::CRAWL_ORDER, + "crawl_type" => self::CRAWL_TYPE, + "crawl_index" => self::CRAWL_INDEX, "restrict_sites_by_url" => self::RESTRICT_SITES_BY_URL, "allowed_sites" => self::ALLOWED_SITES, "disallowed_sites" => self::DISALLOWED_SITES, @@ -423,10 +527,13 @@ class QueueServer implements CrawlConstants $this->index_archive = NULL; gc_collect_cycles(); // garbage collect old crawls - $this->web_queue = new WebQueueBundle( - CRAWL_DIR.'/cache/'.self::queue_base_name. - $this->crawl_time, URL_FILTER_SIZE, - NUM_URLS_QUEUE_RAM, $min_or_max); + + if($this->crawl_type == self::WEB_CRAWL) { + $this->web_queue = new WebQueueBundle( + CRAWL_DIR.'/cache/'.self::queue_base_name. 
+ $this->crawl_time, URL_FILTER_SIZE, + NUM_URLS_QUEUE_RAM, $min_or_max); + } if(!file_exists( CRAWL_DIR.'/cache/'.self::index_data_base_name.$this->crawl_time)) { @@ -454,8 +561,10 @@ class QueueServer implements CrawlConstants } // chmod so web server can also write to these directories - $this->db->setWorldPermissionsRecursive( - CRAWL_DIR.'/cache/'.self::queue_base_name.$this->crawl_time); + if($this->crawl_type == self::WEB_CRAWL) { + $this->db->setWorldPermissionsRecursive( + CRAWL_DIR.'/cache/'.self::queue_base_name.$this->crawl_time); + } $this->db->setWorldPermissionsRecursive( CRAWL_DIR.'/cache/'.self::index_data_base_name.$this->crawl_time); // initialize, store the description of this crawl in the index archive @@ -793,23 +902,7 @@ class QueueServer implements CrawlConstants */ function processDataArchive($file) { - crawlLog("Processing File: $file"); - - $fh = fopen($file, "rb"); - $machine_string = fgets($fh); - $len = strlen($machine_string); - if($len > 0) { - $machine_info = unserialize(base64_decode($machine_string)); - } - $sites = unserialize(gzuncompress(base64_decode( - urldecode(fread($fh, filesize($file) - $len)) - ))); - fclose($fh); - - if(isset($machine_info[self::MACHINE])) { - $this->most_recent_fetcher = & $machine_info[self::MACHINE]; - unset($machine_info); - } + $sites = & $this->getDataArchiveFileData($file); crawlLog("...Updating Delayed Hosts Array ..."); $start_time = microtime(); @@ -905,6 +998,15 @@ class QueueServer implements CrawlConstants unlink($file); + + $this->writeCrawlStatus($sites); + } + + /** + * + */ + function writeCrawlStatus(&$sites) + { $crawl_status = array(); $stat_file = CRAWL_DIR."/schedules/crawl_status.txt"; if(file_exists($stat_file) ) { @@ -939,7 +1041,6 @@ class QueueServer implements CrawlConstants crawlLog("URL: $url"); } } - } /** @@ -952,6 +1053,24 @@ class QueueServer implements CrawlConstants $this->web_queue->differenceSeenUrls($sites, 0); } + /** + * + */ + function 
calculateScheduleMetaInfo($schedule_time) + { + $sites = array(); + $sites[self::CRAWL_TIME] = $this->crawl_time; + $sites[self::SCHEDULE_TIME] = $schedule_time; + $sites[self::SAVED_CRAWL_TIMES] = $this->getCrawlTimes(); + // fetcher should delete any crawl time not listed here + $sites[self::CRAWL_ORDER] = $this->crawl_order; + $sites[self::CRAWL_TYPE] = $this->crawl_type; + $sites[self::CRAWL_INDEX] = $this->crawl_index; + $sites[self::META_WORDS] = $this->meta_words; + $sites[self::SITES] = array(); + + return base64_encode(serialize($sites))."\n"; + } /** * Produces a schedule.txt file of url data for a fetcher to crawl next. @@ -976,16 +1095,10 @@ class QueueServer implements CrawlConstants $count = $this->web_queue->to_crawl_queue->count; - $sites = array(); - $sites[self::CRAWL_TIME] = $this->crawl_time; - $sites[self::SCHEDULE_TIME] = time(); - $sites[self::SAVED_CRAWL_TIMES] = $this->getCrawlTimes(); - // fetcher should delete any crawl time not listed here - $sites[self::CRAWL_ORDER] = $this->crawl_order; - $sites[self::SITES] = array(); - $sites[self::META_WORDS] = $this->meta_words; - $first_line = base64_encode(serialize($sites))."\n"; + $schedule_time = time(); + $first_line = $this->calculateScheduleMetaInfo($schedule_time); + $sites = array(); $delete_urls = array(); $crawl_delay_hosts = array(); @@ -1017,10 +1130,10 @@ class QueueServer implements CrawlConstants } else { $next_slot = $this->getEarliestSlot($current_crawl_index, - $sites[self::SITES]); + $sites); if($next_slot < MAX_FETCH_SIZE) { - $sites[self::SITES][$next_slot] = + $sites[$next_slot] = array($url, $weight, 0); $delete_urls[$i] = $url; /* note don't add to seen url filter @@ -1069,11 +1182,11 @@ class QueueServer implements CrawlConstants && $num_waiting < MAX_WAITING_HOSTS) || (isset($this->waiting_hosts[crawlHash($host_url)]) && $this->waiting_hosts[crawlHash($host_url) ] == - $sites[self::SCHEDULE_TIME])) { + $schedule_time)) { $this->waiting_hosts[crawlHash($host_url)] = - 
$sites[self::SCHEDULE_TIME]; - $this->waiting_hosts[$sites[self::SCHEDULE_TIME]][] = + $schedule_time; + $this->waiting_hosts[$schedule_time][] = crawlHash($host_url); $request_batches_per_delay = ceil($delay/$time_per_request_guess); @@ -1089,9 +1202,9 @@ class QueueServer implements CrawlConstants if(($next_slot = $this->getEarliestSlot( $next_earliest_slot, - $sites[self::SITES])) < MAX_FETCH_SIZE) { + $sites)) < MAX_FETCH_SIZE) { $crawl_delay_hosts[$host_url] = $next_slot; - $sites[self::SITES][$next_slot] = + $sites[$next_slot] = array($url, $weight, $delay); $delete_urls[$i] = $url; $this->web_queue->addSeenUrlFilter($url); @@ -1104,9 +1217,9 @@ class QueueServer implements CrawlConstants } } else { // add a url no crawl delay $next_slot = $this->getEarliestSlot( - $current_crawl_index, $sites[self::SITES]); + $current_crawl_index, $sites); if($next_slot < MAX_FETCH_SIZE) { - $sites[self::SITES][$next_slot] = + $sites[$next_slot] = array($url, $weight, 0); $delete_urls[$i] = $url; $this->web_queue->addSeenUrlFilter($url); @@ -1133,28 +1246,28 @@ class QueueServer implements CrawlConstants $this->web_queue->removeQueue($delete_url); } - if(isset($sites[self::SITES]) && count($sites[self::SITES]) > 0 ) { + if(isset($sites) && count($sites) > 0 ) { $dummy_slot = array(self::DUMMY, 0.0, 0); /* dummy's are used for crawl delays of sites with longer delays when we don't have much else to crawl */ $cnt = 0; for($j = 0; $j < MAX_FETCH_SIZE; $j++) { - if(isset( $sites[self::SITES][$j])) { + if(isset( $sites[$j])) { $cnt++; if($cnt == $fetch_size) {break; } } else { if($j % NUM_MULTI_CURL_PAGES == 0) { - $sites[self::SITES][$j] = $dummy_slot; + $sites[$j] = $dummy_slot; } } } - ksort($sites[self::SITES]); + ksort($sites); //write schedule to disk $fh = fopen(CRAWL_DIR."/schedules/schedule.txt", "wb"); fwrite($fh, $first_line); - foreach($sites[self::SITES] as $site) { + foreach($sites as $site) { list($url, $weight, $delay) = $site; $out_string = base64_encode( 
packFloat($weight).packInt($delay).$url)."\n"; @@ -1231,7 +1344,7 @@ class QueueServer implements CrawlConstants } /** - * Checks if the url belongs to one of the sites list in site_array + * Checks if the url belongs to one of the sites listed in site_array * Sites can be either given in the form domain:host or * in the form of a url in which case it is check that the site url * is a substring of the passed url. diff --git a/configs/config.php b/configs/config.php index 4567d0ac7..2bad1a1cd 100755 --- a/configs/config.php +++ b/configs/config.php @@ -162,7 +162,6 @@ define('MAXIMUM_CRAWL_DELAY', 64); /** maximum number of active crawl-delayed hosts */ define('MAX_WAITING_HOSTS', 1000); - /** * bloom filters are used to keep track of which urls are visited, @@ -262,6 +261,7 @@ $PAGE_PROCESSORS = array( "text/html" => "HtmlProcessor", "text/asp" => "HtmlProcessor", "text/xml" => "XmlProcessor", + "application/xml" => "XmlProcessor", "application/xhtml+xml" => "HtmlProcessor", "application/rss+xml" => "RssProcessor", @@ -283,7 +283,8 @@ $PAGE_PROCESSORS = array( "text/html" => "HtmlProcessor", "image/svg+xml"=> "SvgProcessor" ); - +/** Characters we view as not part of words, not same as POSIX [:punct:]*/ +define ('PUNCT', "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\||\&"); /** * How many non robot urls the fetcher successfully downloads before diff --git a/configs/default_crawl.ini b/configs/default_crawl.ini index bb1577a8e..b428899bf 100644 --- a/configs/default_crawl.ini +++ b/configs/default_crawl.ini @@ -24,6 +24,7 @@ ; [general] crawl_order = 'ad'; +crawl_type = 'ax'; restrict_sites_by_url = false; [allowed_sites] diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php index 9b912ea3b..45a36e4ad 100755 --- a/controllers/admin_controller.php +++ b/controllers/admin_controller.php @@ -250,7 +250,7 @@ class AdminController extends Controller implements CrawlConstants $data = array_merge($data, $crawl_status); } } - $data['RECENT_CRAWLS'] 
= $this->crawlModel->getCrawlList(); + $data['RECENT_CRAWLS'] = $this->crawlModel->getCrawlList(false, true); if(isset($data['CRAWL_TIME'])) { //erase from previous crawl list any active crawl $num_crawls = count($data['RECENT_CRAWLS']); @@ -687,6 +687,12 @@ class AdminController extends Controller implements CrawlConstants $info[self::STATUS] = "NEW_CRAWL"; $info[self::CRAWL_TIME] = time(); $seed_info = $this->crawlModel->getSeedInfo(); + $info[self::CRAWL_TYPE] = + $seed_info['general']['crawl_type']; + $info[self::CRAWL_INDEX] = + (isset($seed_info['general']['crawl_index'])) ? + $seed_info['general']['crawl_index'] : + ''; $info[self::TO_CRAWL] = $seed_info['seed_sites']['url']; $info[self::CRAWL_ORDER] = @@ -789,15 +795,22 @@ class AdminController extends Controller implements CrawlConstants (getLocaleDirection() == 'ltr') ? "right": "left"; $data["ELEMENT"] = "crawloptionsElement"; $crawls = $this->crawlModel->getCrawlList(); + $indexes = $this->crawlModel->getCrawlList(true, true); $update_flag = false; $data['available_options'] = array( tl('admin_controller_use_below'), tl('admin_controller_use_defaults')); + $data['available_crawl_indexes'] = array(); $data['options_default'] = tl('admin_controller_use_below'); foreach($crawls as $crawl) { $data['available_options'][$crawl['CRAWL_TIME']] = tl('admin_controller_previous_crawl')." ". 
$crawl['DESCRIPTION']; + + } + foreach($indexes as $crawl) { + $data['available_crawl_indexes'][$crawl['CRAWL_TIME']] + = $crawl['DESCRIPTION']; } $no_further_changes = false; if(isset($_REQUEST['load_option']) && @@ -816,6 +829,34 @@ class AdminController extends Controller implements CrawlConstants } else { $seed_info = $this->crawlModel->getSeedInfo(); } + if(!$no_further_changes && isset($_REQUEST['crawl_indexes']) + && in_array($_REQUEST['crawl_indexes'], + array_keys($data['available_crawl_indexes']))) { + $seed_info['general']['crawl_index'] = + $_REQUEST['crawl_indexes']; + $update_flag = true; + } + $data['crawl_index'] = + (isset($seed_info['general']['crawl_index'])) ? + $seed_info['general']['crawl_index'] : ''; + $data['available_crawl_types'] = array(self::WEB_CRAWL, + self::ARCHIVE_CRAWL); + if(!$no_further_changes && isset($_REQUEST['crawl_type']) + && in_array($_REQUEST['crawl_type'], + $data['available_crawl_types'])) { + $seed_info['general']['crawl_type'] = + $_REQUEST['crawl_type']; + $update_flag = true; + } + $data['crawl_type'] = $seed_info['general']['crawl_type']; + if($data['crawl_type'] == self::WEB_CRAWL) { + $data['web_crawl_active'] = "active"; + $data['archive_crawl_active'] = ""; + } else { + $data['archive_crawl_active'] = "active"; + $data['web_crawl_active'] = ""; + } + $data['available_crawl_orders'] = array( self::BREADTH_FIRST => tl('admin_controller_breadth_first'), @@ -879,7 +920,13 @@ class AdminController extends Controller implements CrawlConstants " elt('load-options').onchange = ". "function() { if(elt('load-options').selectedIndex !=". " 0) { elt('crawloptionsForm').submit(); }};"; - + if($data['crawl_type'] == CrawlConstants::WEB_CRAWL) { + $data['SCRIPT'] .= + "switchTab('webcrawltab', 'archivetab');"; + } else { + $data['SCRIPT'] .= + "switchTab('archivetab', 'webcrawltab');"; + } if($update_flag) { $this->crawlModel->setSeedInfo($seed_info); $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >". 
diff --git a/controllers/search_controller.php b/controllers/search_controller.php index cc16f5ff0..79b8cc8ee 100755 --- a/controllers/search_controller.php +++ b/controllers/search_controller.php @@ -367,39 +367,42 @@ class SearchController extends Controller implements CrawlConstants exit(); } - $machine = $crawl_item[self::MACHINE]; $machine_uri = $crawl_item[self::MACHINE_URI]; $page = $crawl_item[self::HASH]; $offset = $crawl_item[self::OFFSET]; $cache_item = $this->crawlModel->getCacheFile($machine, $machine_uri, $generation, $offset, $crawl_time); - $cache_file = $cache_item[self::PAGE]; - $request = $cache_item['REQUEST']; - - $meta_words = array('link\:', 'site\:', - 'filetype\:', 'info\:', '\-', - 'index:', 'ip:', 'i:', 'weight:', 'w:', 'u:'); - foreach($meta_words as $meta_word) { - $pattern = "/(\s)($meta_word(\S)+)/"; - $query = preg_replace($pattern, "", $query); - } - $query = str_replace("'", " ", $query); - $query = str_replace('"', " ", $query); - $query = str_replace('\\', " ", $query); - $query = str_replace('|', " ", $query); - $query = $this->clean($query, "string"); - - $page_url = $url; + if(!stristr($cache_item[self::TYPE], "image")) { - $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $query); - $words = mb_split(" ",$phrase_string); - if(!$highlight) { + $meta_words = array('link\:', 'site\:', 'version\:', 'modified\:', + 'filetype\:', 'info\:', '\-', 'os\:', 'server\:', 'date\:', + 'lang\:', + 'index:', 'ip:', 'i:', 'weight:', 'w:', 'u:'); + foreach($meta_words as $meta_word) { + $pattern = "/(\s)($meta_word(\S)+)/"; + $query = preg_replace($pattern, "", $query); + } + $query = str_replace("'", " ", $query); + $query = str_replace('"', " ", $query); + $query = str_replace('\\', " ", $query); + $query = str_replace('|', " ", $query); + $query = $this->clean($query, "string"); + + $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $query); + $words = mb_split(" ",$phrase_string); + if(!$highlight) { + $words = array(); + } + } else { 
+ $type = $cache_item[self::TYPE]; + $cache_file = "<html><head><title>Yioop! Cache</title></head>". + "<body><object data='data:$type;base64,". + base64_encode($cache_file)."' type='$type' /></body></html>"; $words = array(); } - $date = date ("F d Y H:i:s", $cache_item[self::TIMESTAMP]); $dom = new DOMDocument(); @@ -418,17 +421,22 @@ class SearchController extends Controller implements CrawlConstants $body = $dom->getElementsByTagName('body')->item(0); } $first_child = $body->firstChild; - + $preNode = $dom->createElement('pre'); + $preNode = $body->insertBefore($preNode, $first_child); $divNode = $dom->createElement('div'); - $divNode = $body->insertBefore($divNode, $first_child); + $divNode = $body->insertBefore($divNode, $preNode); $divNode->setAttributeNS("","style", "border-color: black; ". "border-style:solid; border-width:3px; ". "padding: 5px; background-color: white"); $textNode = $dom->createTextNode(tl('search_controller_cached_version', - "$page_url", $date)); - $textNode = $divNode->appendChild($textNode); + "$url", $date)); + $divNode->appendChild($textNode); + if(isset($cache_item[self::HEADER])) { + $textNode = $dom->createTextNode($cache_item[self::HEADER]); + $preNode->appendChild($textNode); + } $body = $this->markChildren($body, $words, $dom); $newDoc = $dom->saveHTML(); diff --git a/controllers/settings_controller.php b/controllers/settings_controller.php index 5456248b0..b476c0133 100755 --- a/controllers/settings_controller.php +++ b/controllers/settings_controller.php @@ -111,7 +111,7 @@ class SettingsController extends Controller $data['PER_PAGE_SELECTED'] = NUM_RESULTS_PER_PAGE; } - $crawls = $this->crawlModel->getCrawlList(); + $crawls = $this->crawlModel->getCrawlList(false, true); $data['CRAWLS'] = array(); foreach($crawls as $crawl) { $data['CRAWLS'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION']. 
diff --git a/css/search.css b/css/search.css index 9fbf45143..ef537c8e0 100755 --- a/css/search.css +++ b/css/search.css @@ -614,6 +614,44 @@ p } +.tabmenu-list +{ + border-bottom: 2px solid black; + padding: 0px; + margin-bottom: 0px; + z-index: 1; +} +.tabmenu-list li +{ + display: inline; + list-style-type: none; +} + +.tabmenu-list a +{ + border: 1px solid black; + border-bottom: 0px; + padding: 5px 5px 0px 5px; + background-color:#EEE; + overflow: hidden; + margin: 0; + text-decoration: none; +} + +.tabmenu-list a.active +{ + background-color:white; + border-bottom: 3px solid white; +} + +.tabmenu-content +{ + border: 1px solid black; + border-top: none; + padding: 10px; + z-index: 2; +} + .crawlstable, .mixestable, .crawlstable th, .mixestable th, .crawlstable td, .mixestable td { diff --git a/index.php b/index.php index 8a31f6367..0a3911ae8 100755 --- a/index.php +++ b/index.php @@ -48,7 +48,6 @@ ini_set("memory_limit","500M"); header("X-FRAME-OPTIONS: DENY"); //prevent click jacking session_name(SESSION_NAME); session_start(); - /** * Sets up DB to be used */ diff --git a/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php b/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php new file mode 100644 index 000000000..28741d7eb --- /dev/null +++ b/lib/archive_bundle_iterators/arc_archive_bundle_iterator.php @@ -0,0 +1,201 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010, 2011 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage iterator + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010, 2011 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads base class for iterating + */ +require_once BASE_DIR. + '/lib/archive_bundle_iterators/archive_bundle_iterator.php'; + +/** + * Used to iterate through the records of a collection of arc files stored in + * a WebArchiveBundle folder. Arc is the file format of the Internet Archive + * http://www.archive.org/web/researcher/ArcFileFormat.php. Iteration would be + * for the purpose making an index of these records + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage iterator + * @see WebArchiveBundle + */ +class ArcArchiveBundleIterator implements CrawlConstants +{ + /** + * The number of arc files in this arc archive bundle + * @var int + */ + var $num_partitions; + + /** + * Counting in glob order for this arc archive bundle directory, the + * current active file number of the arc file being process. + * + * @var int + */ + var $current_partition_num; + /** + current byte offset into the current arc file + * @var int + */ + var $current_offset; + /** + * Array of filenames of arc files in this directory (glob order) + * @var array + */ + var $partitions; + /** + * File handle for current arc file + * @var resource + */ + var $fh; + + /** + * Creates a arc archive iterator with the given parameters. 
+ * + * @param string $iterate_timestamp timestamp of the arc archive bundle to + * iterate over the pages of + * @param string $result_timestamp timestamp of the arc archive bundle + * results are being stored in + */ + function __construct($iterate_timestamp, $result_timestamp) + { + $this->iterate_timestamp = $iterate_timestamp; + $this->result_timestamp = $result_timestamp; + $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name. + $iterate_timestamp; + $this->partitions = array(); + foreach(glob("$archive_name/*.arc.gz") as $filename) { + $this->partitions[] = $filename; + } + $this->num_partitions = count($this->partitions); + + if(file_exists("$archive_name/iterate_status.txt")) { + $info = unserialize(file_get_contents( + "$archive_name/iterate_status.txt")); + $this->end_of_iterator = $info['end_of_iterator']; + $this->current_partition_num = $info['current_partition_num']; + $this->current_offset = $info['current_offset']; + } else { + $this->reset(); + } + + $this->fh=gzopen($this->partitions[$this->current_partition_num], "rb"); + gzseek($this->fh, $this->current_offset); + + } + + /** + * Resets the iterator to the start of the archive bundle + */ + function reset() + { + $this->current_partition_num = 0; + $this->end_of_iterator = false; + $this->current_offset = 0; + $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name. 
+ $this->result_timestamp; + @unlink("$archive_name/iterate_status.txt"); + } + + /** + * Gets the next $num many docs from the iterator + * @param int $num number of docs to get + * @return array associative arrays for $num pages + */ + function nextPages($num) + { + $pages = array(); + for($i = 0; $i < $num; $i++) { + $page = $this->nextPage(); + if(!$page) { + if(is_resource($this->fh)) { + gzclose($this->fh); + } + $this->current_partition_num++; + if($this->current_partition_num >= $this->num_partitions) { + $this->end_of_iterator = true; + break; + } + $this->fh = gzopen( + $this->partitions[$this->current_partition_num], "rb"); + } else { + $pages[] = $page; + } + } + if(is_resource($this->fh)) { + $this->current_offset = gztell($this->fh); + } + + $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name. + $this->result_timestamp; + $info = array(); + $info['end_of_iterator'] = $this->end_of_iterator; + $info['current_partition_num'] = $this->current_partition_num; + $info['current_offset'] = $this->current_offset; + file_put_contents("$archive_name/iterate_status.txt", + serialize($info)); + return $pages; + } + + + /** + * Gets the next doc from the iterator + * @return array associative array for doc + */ + function nextPage() + { + if(!is_resource($this->fh)) return NULL; + do { + if(!$page_info = gzgets($this->fh) ) return NULL; + $info_parts = explode(" ", $page_info); + $num_parts = count($info_parts); + $length = $info_parts[$num_parts - 1]; + + if(!$object = gzread($this->fh, $length + 1)) return NULL; + } while(substr($page_info, 0, 3) == 'dns'); //ignore dns entries in arc + $site = array(); + $site[self::URL] = $info_parts[0]; + $site[self::IP_ADDRESSES] = array($info_parts[1]); + $site[self::TIMESTAMP] = date("U", strtotime($info_parts[2])); + $site[self::TYPE] = $info_parts[3]; + $site_contents = FetchUrl::parseHeaderPage($object); + $site = array_merge($site, $site_contents); + $site[self::HASH] = 
FetchUrl::computePageHash($site[self::PAGE]); + $site[self::WEIGHT] = 1; + return $site; + } + +} +?> diff --git a/lib/archive_bundle_iterators/archive_bundle_iterator.php b/lib/archive_bundle_iterators/archive_bundle_iterator.php new file mode 100644 index 000000000..938efae4c --- /dev/null +++ b/lib/archive_bundle_iterators/archive_bundle_iterator.php @@ -0,0 +1,83 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010, 2011 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage iterator + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010, 2011 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** Loads common constants for web crawling*/ +require_once BASE_DIR."/lib/crawl_constants.php"; + +/** + * Abstract class used to model iterating documents indexed in + * an WebArchiveBundle or set of such bundles. 
+ * + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage iterator + * @see WebArchiveBundle + */ +abstract class ArchiveBundleIterator implements CrawlConstants +{ + + + /** + * Timestamp of the archive that is being iterated over + * @var int + */ + var $iterate_timestamp; + + /** + * Timestamp of the archive that is being used to store results in + * @var int + */ + var $result_timestamp; + + /** + * Whether or not the iterator still has more documents + * @var bool + */ + var $end_of_iterator; + + /** + * Gets the next $num many docs from the iterator + * @param int $num number of docs to get + * @return array associative arrays for $num pages + */ + abstract function nextPages($num); + + /** + * Resets the iterator to the start of the archive bundle + */ + abstract function reset(); +} +?> diff --git a/lib/archive_bundle_iterators/web_archive_bundle_iterator.php b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php new file mode 100644 index 000000000..83861f537 --- /dev/null +++ b/lib/archive_bundle_iterators/web_archive_bundle_iterator.php @@ -0,0 +1,196 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010, 2011 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage iterator + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010, 2011 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads base class for iterating + */ +require_once BASE_DIR. + '/lib/archive_bundle_iterators/archive_bundle_iterator.php'; + +/** + * Class used to model iterating documents indexed in + * an WebArchiveBundle. This would typically be for the purpose + * of re-indexing these documents. + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage iterator + * @see WebArchiveBundle + */ +class WebArchiveBundleIterator implements CrawlConstants +{ + + /** + * Number of web archive objects in this web archive bundle + * @var int + */ + var $num_partitions; + /** + * The current web archive in the bundle that is being iterated over + * @var int + */ + var $partition; + /** + * The item within the current partition to be returned next + * @var int + */ + var $partition_index; + /** + * Index of web archive in the web archive bundle that the iterator is + * currently getting results from + * @var int + */ + var $current_partition_num; + /** + * Index between 0 and $this->count of where the iterator is at + * @var int + */ + var $overall_index; + /** + * Number of documents in the web archive bundle being iterated over + * @var int + */ + var $count; + /** + * The web archive bundle being iterated over + * @var object + */ + var $archive; + + /** + * Creates a web archive iterator with the given parameters. 
+ * + * @param string $iterate_timestamp timestamp of the web archive bundle to + * iterate over the pages of + * @param string $result_timestamp timestamp of the web archive bundle + * results are being stored in + */ + function __construct($iterate_timestamp, $result_timestamp) + { + $this->iterate_timestamp = $iterate_timestamp; + $this->result_timestamp = $result_timestamp; + $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name. + $iterate_timestamp; + $this->archive = new WebArchiveBundle($archive_name); + $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name. + $result_timestamp; + if(file_exists("$archive_name/iterate_status.txt")) { + $info = unserialize(file_get_contents( + "$archive_name/iterate_status.txt")); + $this->count = $this->archive->count; + $this->num_partitions = $this->archive->write_partition+1; + $this->overall_index = $info['overall_index']; + $this->end_of_iterator = $info['end_of_iterator']; + $this->partition_index = $info['partition_index']; + $this->current_partition_num = $info['current_partition_num']; + $this->partition = $this->archive->getPartition( + $this->current_partition_num, false); + $this->partition->iterator_pos = $info['iterator_pos']; + } else { + $this->reset(); + } + + } + + /** + * Gets the next $num many docs from the iterator + * + * @param int $num number of docs to get + * @return array associative arrays for $num pages + */ + function nextPages($num) + { + if($num + $this->overall_index >= $this->count) { + $num = max($this->count - $this->overall_index, 0); + } + $num_to_get = 1; + $objects = array(); + for($i = 0; $i < $num; $i += $num_to_get) { + $num_to_get = min($num, $this->partition->count - + $this->partition_index); + $pre_new_objects = $this->partition->nextObjects($num_to_get); + foreach($pre_new_objects as $object) { + $objects[] = $object[1]; + } + + $this->overall_index += $num_to_get; + $this->partition_index += $num_to_get; + if($num_to_get <= 0) { + $this->current_partition_num++; 
+ $this->partition = $this->archive->getPartition( + $this->current_partition_num, false); + $this->partition_index = 0; + } + if($this->current_partition_num > $this->num_partitions) break; + } + $this->end_of_iterator = ($this->overall_index >= $this->count ) ? + true : false; + + $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name. + $this->result_timestamp; + $info = array(); + $info['overall_index'] = $this->overall_index; + $info['end_of_iterator'] = $this->end_of_iterator; + $info['partition_index'] = $this->partition_index; + $info['current_partition_num'] = $this->current_partition_num; + $info['iterator_pos'] =$this->partition->iterator_pos; + file_put_contents("$archive_name/iterate_status.txt", + serialize($info)); + + return $objects; + } + + /** + * Resets the iterator to the start of the archive bundle + */ + function reset() + { + $this->count = $this->archive->count; + $this->num_partitions = $this->archive->write_partition+1; + $this->overall_index = 0; + $this->end_of_iterator = ($this->overall_index >= $this->count) ? + true : false; + $this->partition_index = 0; + $this->current_partition_num = 0; + $this->partition = $this->archive->getPartition( + $this->current_partition_num, false); + $this->partition->reset(); + $archive_name = CRAWL_DIR.'/cache/'.self::archive_base_name. 
+ $this->result_timestamp; + @unlink("$archive_name/iterate_status.txt"); + } + +} +?> diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php index dd82fb006..a668e37c7 100644 --- a/lib/crawl_constants.php +++ b/lib/crawl_constants.php @@ -142,6 +142,16 @@ interface CrawlConstants const BOOST = 'av'; const IP_ADDRESSES = 'au'; const JUST_METAS = 'aw'; + const WEB_CRAWL = 'ax'; + const ARCHIVE_CRAWL = 'ay'; + const CRAWL_TYPE = 'az'; + const CRAWL_INDEX = 'ba'; + const HEADER = 'bb'; + const SERVER = 'bc'; + const SERVER_VERSION = 'bd'; + const OPERATING_SYSTEM = 'be'; + const MODIFIED = 'bf'; + const LANG = 'bg'; const NEEDS_OFFSET_FLAG = 0x7FFFFFFF; diff --git a/lib/fetch_url.php b/lib/fetch_url.php index 2493b18d2..e53fda044 100755 --- a/lib/fetch_url.php +++ b/lib/fetch_url.php @@ -91,6 +91,7 @@ class FetchUrl implements CrawlConstants curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true); curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT, PAGE_TIMEOUT); curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT); + curl_setopt($sites[$i][0], CURLOPT_HEADER, true); curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER, array('Range: bytes=0-'.PAGE_RANGE_REQUEST)); curl_multi_add_handle($agent_handler, $sites[$i][0]); @@ -128,9 +129,21 @@ class FetchUrl implements CrawlConstants $ip_addresses = self::getCurlIp($ip_holder[$i]); fclose($ip_holder[$i]); if($sites[$i][0]) { - // Get Data and Message Code $content = @curl_multi_getcontent($sites[$i][0]); + + if(isset($content)) { + $site = self::parseHeaderPage($content, $value); + $sites[$i] = array_merge($sites[$i], $site); + /* + Store Data into our $sites array, create a hash for + deduplication purposes + */ + $sites[$i][$hash] = + self::computePageHash($sites[$i][$value]); + + } + $sites[$i][self::HTTP_CODE] = curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE); if(!$sites[$i][self::HTTP_CODE]) { @@ -141,35 +154,6 @@ class FetchUrl implements CrawlConstants } else { $sites[$i][self::IP_ADDRESSES] = 
array("0.0.0.0"); } - /* - Store Data into our $sites array, create a hash for - deduplication purposes - */ - if(isset($content)) { - $sites[$i][$value] = - mb_substr($content, 0, PAGE_RANGE_REQUEST); - /* to do dedup we strip script, noscript, and style tags - as well as their content, then we strip tags, get rid - of whitespace and hash - */ - $strip_array = - array('@<script[^>]*?>.*?</script>@si', - '@<noscript[^>]*?>.*?</noscript>@si', - '@<style[^>]*?>.*?</style>@si'); - $dedup_string = preg_replace( - $strip_array, '', $sites[$i][$value]); - $dedup_string_old = preg_replace( - '/\W+/', '', $dedup_string); - $dedup_string = strip_tags($dedup_string_old); - if($dedup_string == "") { - $dedup_string = $dedup_string_old; - } - $dedup_string = preg_replace( - '/\W+/', '', $dedup_string); - - $sites[$i][$hash] = crawlHash($dedup_string, true); - - } //Get Time, Mime type and Character encoding $sites[$i][self::TIMESTAMP] = time(); @@ -207,6 +191,103 @@ class FetchUrl implements CrawlConstants return $sites; } + /** + * Computes a hash of a string containing page data for use in + * deduplication of pages with similar content + * + * @param string &$page web page data + * @return string 8 byte hash to identify page contents + */ + public static function computePageHash(&$page) + { + /* to do dedup we strip script, noscript, and style tags + as well as their content, then we strip tags, get rid + of whitespace and hash + */ + $strip_array = + array('@<script[^>]*?>.*?</script>@si', + '@<noscript[^>]*?>.*?</noscript>@si', + '@<style[^>]*?>.*?</style>@si'); + $dedup_string = preg_replace( + $strip_array, '', $page); + $dedup_string_old = preg_replace( + '/\W+/', '', $dedup_string); + $dedup_string = strip_tags($dedup_string_old); + if($dedup_string == "") { + $dedup_string = $dedup_string_old; + } + $dedup_string = preg_replace( + '/\W+/', '', $dedup_string); + + return crawlHash($dedup_string, true); + } + + /** + * Splits an http response document into the http 
headers sent + * and the web page returned. Parses out useful information from + * the header and return an array of these two parts and the useful info. + * + * @param string &$header_and_page + * @param string $value + * @return array info array consisting of a header, page for an http + * response, as well as parsed from the header the server, server + * version, operating system, encoding, and date information. + */ + public static function parseHeaderPage(&$header_and_page, + $value=CrawlConstants::PAGE) + { + $CRLFCRLF = strpos($header_and_page, "\x0D\x0A\x0D\x0A"); + $LFLF = strpos($header_and_page, "\x0A\x0A"); + + $header_offset = ($CRLFCRLF > 0) ? $CRLFCRLF : $LFLF; + //either two CRLF (what spec says) or two LF's to be safe + $site = array(); + $site[CrawlConstants::HEADER] = + substr($header_and_page, 0, $header_offset); + + $site[$value] = ltrim(substr($header_and_page, $header_offset)); + $lines = explode("\n", $site[CrawlConstants::HEADER]); + $first_line = array_shift($lines); + $response = preg_split("/(\s+)/", $first_line); + $site[CrawlConstants::HTTP_CODE] = @trim($response[1]); + foreach($lines as $line) { + $line = trim($line); + if(stristr($line, 'Server:')) { + $server_parts = explode("Server:", $line); + $server_name_parts = explode("/", $server_parts[1]); + $site[CrawlConstants::SERVER] = @trim($server_name_parts[0]); + if(isset($server_name_parts[1])) { + $version_parts = explode("(", $server_name_parts[1]); + $site[CrawlConstants::SERVER_VERSION] = + @trim($version_parts[0]); + if(isset($version_parts[1])) { + $os_parts = explode(")", $version_parts[1]); + $site[CrawlConstants::OPERATING_SYSTEM] = + @trim($os_parts[0]); + } + } + } + if(stristr($line, 'charset=')) { + $line_parts = explode("charset=", $line); + $site[CrawlConstants::ENCODING] = @trim($line_parts[1]); + } + if(stristr($line, 'Last-Modified:')) { + $line_parts = explode("Last-Modified:", $line); + $site[CrawlConstants::MODIFIED] = + strtotime(@trim($line_parts[1])); + } + 
+ } + if(!isset($site[CrawlConstants::ENCODING]) ) { + $site[CrawlConstants::ENCODING] = + mb_detect_encoding($site[$value], 'auto'); + } + if(!isset($site[CrawlConstants::SERVER]) ) { + $site[CrawlConstants::SERVER] = "unknown"; + } + return $site; + } + /** * Computes the IP address from a file pointer assumed to be pointing * at STDERR output from a curl request @@ -242,7 +323,7 @@ class FetchUrl implements CrawlConstants curl_setopt($agent, CURLOPT_USERAGENT, USER_AGENT); curl_setopt($agent, CURLOPT_URL, $site); curl_setopt($agent, CURLOPT_AUTOREFERER, true); - curl_setopt($agent, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($agent, CURLOPT_FOLLOWLOCATION, true); curl_setopt($agent, CURLOPT_RETURNTRANSFER, true); curl_setopt($agent, CURLOPT_FAILONERROR, true); diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php index b80186bfb..48498ce3b 100644 --- a/lib/index_archive_bundle.php +++ b/lib/index_archive_bundle.php @@ -143,7 +143,7 @@ class IndexArchiveBundle implements CrawlConstants * @param int $num_partitions_summaries number of WebArchive partitions * to use in the summmaries WebArchiveBundle * @param string $description a text name/serialized info about this - * IndexArchiveBundle + * IndexArchiveBundle */ public function __construct($dir_name, $read_only_archive = true, $description = NULL, $num_docs_per_generation = NUM_DOCS_PER_GENERATION) @@ -406,18 +406,34 @@ class IndexArchiveBundle implements CrawlConstants /** - * Gets the description, count of summaries, and number of partions of the - * summaries store in the supplied directory + * Gets the description, count of summaries, and number of partitions of the + * summaries store in the supplied directory. If the file arctype.txt + * exist, this is view as a dummy index archive for the sole purpose of + * allowing conversions of downloaded data such as arc files into + * Yioop! format. 
* * @param string path to a directory containing a summaries WebArchiveBundle * @return array summary of the given archive */ public static function getArchiveInfo($dir_name) { + if(file_exists($dir_name."/arc_description.txt")) { + $crawl = array(); + $info = array(); + $crawl['DESCRIPTION'] = substr( + file_get_contents($dir_name."/arc_description.txt"), 0, 256); + $crawl['ARCFILE'] = true; + $info['VISITED_URLS_COUNT'] = 0; + $info['COUNT'] = 0; + $info['NUM_DOCS_PER_PARTITION'] = 0; + $info['WRITE_PARTITION'] = 0; + $info['DESCRIPTION'] = serialize($crawl); + + return $info; + } return WebArchiveBundle::getArchiveInfo($dir_name."/summaries"); } } ?> - diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php index d7f5dc8af..8f2669f25 100644 --- a/lib/index_bundle_iterators/group_iterator.php +++ b/lib/index_bundle_iterators/group_iterator.php @@ -110,7 +110,6 @@ class GroupIterator extends IndexBundleIterator * * @param object $index_bundle_iterator to use as a source of documents * to iterate over - */ function __construct($index_bundle_iterator) { diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php index fc8bcdddf..4fb1a501a 100755 --- a/lib/phrase_parser.php +++ b/lib/phrase_parser.php @@ -34,9 +34,12 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** - * Load the stem word function, if necessary + * Load the stem word functions, if necessary */ -require_once BASE_DIR."/lib/porter_stemmer.php"; +foreach(glob(BASE_DIR."/lib/stemmers/*_stemmer.php") + as $filename) { + require_once $filename; +} /** * Reads in constants used as enums used for storing web sites @@ -54,19 +57,28 @@ require_once BASE_DIR."/lib/crawl_constants.php"; */ class PhraseParser { + /** + * Language tags and their corresponding stemmer + * @var array + */ + static $STEMMERS = array( + 'en' => "EnStemmer", + 'en-US' => "EnStemmer", + 'en-GB' => "EnStemmer", + 'en-CA' => "EnStemmer", + ); /** * Converts a summary of a web 
page into a string of space separated words * - * @param array $page associateive array of page summary data. Contains + * @param array $page associative array of page summary data. Contains * title, description, and links fields * @return string the concatenated words extracted from the page summary */ static function extractWordStringPageSummary($page) { - $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\|"; - $title_phrase_string = mb_ereg_replace($punct, " ", + $title_phrase_string = mb_ereg_replace(PUNCT, " ", $page[CrawlConstants::TITLE]); - $description_phrase_string = mb_ereg_replace($punct, " ", + $description_phrase_string = mb_ereg_replace(PUNCT, " ", $page[CrawlConstants::DESCRIPTION]); $page_string = $title_phrase_string . " " . $description_phrase_string; @@ -81,17 +93,18 @@ class PhraseParser * * @param string $string subject to extract phrases from * @param int $len longest length of phrases to consider + * @param string $lang locale tag for stemming * @return array pairs of the form (phrase, number of occurrences) */ static function extractPhrasesAndCount($string, - $len = MAX_PHRASE_LEN) + $len = MAX_PHRASE_LEN, $lang = NULL) { - $phrases = array(); for($i = 0; $i < $len; $i++) { $phrases = - array_merge($phrases,self::extractPhrasesOfLength($string, $i)); + array_merge($phrases, + self::extractPhrasesOfLength($string, $i, $lang)); } $phrase_counts = array_count_values($phrases); @@ -105,15 +118,17 @@ class PhraseParser * * @param string $string subject to extract phrases from * @param int $len length of phrases to consider + * @param string $lang locale tag for stemming * @return array pairs of the form (phrase, number of occurrences) */ - static function extractPhrasesOfLength($string, $phrase_len) + static function extractPhrasesOfLength($string, $phrase_len, $lang = NULL) { $phrases = array(); for($i = 0; $i < $phrase_len; $i++) { $phrases = array_merge($phrases, - self::extractPhrasesOfLengthOffset($string, $phrase_len, $i)); + 
self::extractPhrasesOfLengthOffset($string, + $phrase_len, $i, $lang)); } return $phrases; @@ -128,17 +143,21 @@ class PhraseParser * @param string $string subject to extract phrases from * @param int $len length of phrases to consider * @param int $offset the first word to begin with + * @param string $lang locale tag for stemming * @return array pairs of the form (phrase, number of occurrences) */ static function extractPhrasesOfLengthOffset($string, - $phrase_len, $offset) + $phrase_len, $offset, $lang = NULL) { - $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\||\&"; - $words = mb_split("[[:space:]]|".$punct, $string); + $words = mb_split("[[:space:]]|".PUNCT, $string); $stems = array(); - + if(isset(self::$STEMMERS[$lang])) { + $stemmer = self::$STEMMERS[$lang]; + } else { + $stemmer = NULL; + } for($i = $offset; $i < count($words); $i++) { if($words[$i] == "") {continue;} @@ -149,8 +168,9 @@ class PhraseParser } $pre_stem = mb_strtolower($words[$i]); - if(strlen($pre_stem) == mb_strlen($pre_stem)) { - $stem = PorterStemmer::stem($pre_stem); + + if($stemmer != NULL) { + $stem = $stemmer::stem($pre_stem); } else { $stem = $pre_stem; } diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index 9aef48aff..732000193 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -72,10 +72,11 @@ class HtmlProcessor extends TextProcessor if(is_string($page)) { $dom = self::dom($page); if($dom !==false && self::checkMetaRobots($dom)) { + $summary[self::LANG] = self::lang($dom); $summary[self::TITLE] = self::title($dom); $summary[self::DESCRIPTION] = self::description($dom); $summary[self::LINKS] = self::links($dom, $url); - + $summary[self::PAGE] = $page; if(strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) == 0 && count($summary[self::LINKS]) == 0) { //maybe not html? 
treat as text still try to get urls @@ -88,9 +89,7 @@ class HtmlProcessor extends TextProcessor } - static function processDom($dom, $url) - { - } + /** * Return a document object based on a string containing the contents of @@ -132,6 +131,23 @@ class HtmlProcessor extends TextProcessor return true; } + /** + * Determines the language of the html document by looking at the root + * language attribute + * + * @param object $dom - a document object to check the language of + * + * @return string language tag for guessed language + + */ + static function lang($dom) + { + $xpath = new DOMXPath($dom); + $html = $xpath->evaluate("/html"); + $lang = $html->item(0)->getAttribute('lang'); + return $lang; + } + /** * Returns html head title of a webpage based on its document object * diff --git a/lib/processors/rss_processor.php b/lib/processors/rss_processor.php index 6b908b9b6..0ea3481b4 100644 --- a/lib/processors/rss_processor.php +++ b/lib/processors/rss_processor.php @@ -73,6 +73,7 @@ class RssProcessor extends TextProcessor $dom = self::dom($page); if($dom !==false) { + $summary[self::LANG] = self::lang($dom); $summary[self::TITLE] = self::title($dom); $summary[self::DESCRIPTION] = self::description($dom); $summary[self::LINKS] = self::links($dom, $url); @@ -84,12 +85,28 @@ class RssProcessor extends TextProcessor } } } - return $summary; } + /** + * Determines the language of the rss document by looking at the channel + * language tag + * + * @param object $dom - a document object to check the language of + * + * @return string language tag for guessed language + */ + static function lang($dom) + { + $xpath = new DOMXPath($dom); + $languages = $xpath->evaluate("/rss/channel/language"); + if($languages && is_object($languages)) { + return $languages->item(0)->textContent; + } + return NULL; + } /** * Return a document object based on a string containing the contents of diff --git a/lib/processors/sitemap_processor.php b/lib/processors/sitemap_processor.php index 
7257b9332..f3be9eded 100644 --- a/lib/processors/sitemap_processor.php +++ b/lib/processors/sitemap_processor.php @@ -80,6 +80,9 @@ class SitemapProcessor extends TextProcessor $summary = parent::process($page, $url); } $summary[self::JUST_METAS] = true; + } else { + $summary = parent::process($page, $url); + $summary[self::JUST_METAS] = true; } } diff --git a/lib/processors/xml_processor.php b/lib/processors/xml_processor.php index 0574d5e23..5278c876c 100644 --- a/lib/processors/xml_processor.php +++ b/lib/processors/xml_processor.php @@ -89,7 +89,6 @@ class XmlProcessor extends TextProcessor $root_name = isset($dom->documentElement->nodeName) ? $dom->documentElement->nodeName : ""; unset($dom); - switch ($root_name) { case "rss": diff --git a/lib/porter_stemmer.php b/lib/stemmers/en_stemmer.php similarity index 99% rename from lib/porter_stemmer.php rename to lib/stemmers/en_stemmer.php index 50521d70f..54f57eb02 100755 --- a/lib/porter_stemmer.php +++ b/lib/stemmers/en_stemmer.php @@ -46,7 +46,7 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} * @subpackage library */ -class PorterStemmer +class EnStemmer { /** diff --git a/locale/ar/configure.ini b/locale/ar/configure.ini index 7341daf77..b2ca140e1 100755 --- a/locale/ar/configure.ini +++ b/locale/ar/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "" ; -; admin_controller.php line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; 
admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 admin_controller_localename_deleted = "" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php 
line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "" ; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "" ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "" ; -; configure_element.php line: 225 +; configure_element.php 
line: 226 configure_element_robot_name = "" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "" ; -; crawloptions_element.php line: 97 +; crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; 
crawloptions_element.php line: 165 crawloptions_element_save_options = "" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "" ; search_view.php line: 145 search_view_score = "" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; settings_view.php line: 76 diff --git a/locale/de/configure.ini b/locale/de/configure.ini index 5a0ec1b10..20bb1e1f8 100755 --- a/locale/de/configure.ini +++ b/locale/de/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "" ; -; admin_controller.php line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 
admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 admin_controller_localename_deleted = "" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "" ; -; admin_controller.php line: 1424 +; 
admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "" ; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "" ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; 
crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "" ; -; crawloptions_element.php line: 97 +; crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "" ; search_view.php line: 145 search_view_score = "" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; 
settings_view.php line: 76 diff --git a/locale/en-US/configure.ini b/locale/en-US/configure.ini index 9a972dd4a..b4f5ac38b 100755 --- a/locale/en-US/configure.ini +++ b/locale/en-US/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "Use options below" ; admin_controller.php line: 798 admin_controller_previous_crawl = "Previous Crawl:" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "Breadth First" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "Page Importance" ; -; admin_controller.php line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "Updating Seed Site Info!" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "Select Crawl" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "Unnamed Crawl" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "Crawl Mix Created!" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "Mix to Delete Does not Exist!" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "Delete" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "Crawl Mix Changes Saved!" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "Setting Crawl To Use as Index" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "Mix to Delete Does not Exist!" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "Crawl Mix Deleted!" 
; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "Select Locale" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "Locale Added!" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "Locale Does Not Exist!" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 admin_controller_localename_deleted = "Locale Deleted" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "Locale Strings Updated!" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "configs/config.php not web server writable." ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "Work directory needs to be writable by web server. " ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "php.ini directive post_max_size needs to be at least 16M" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "The following required items were missing: %s" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "The following optional items were missing: %s" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "Check Passed." ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "Using configs/local_config.php so changing work directory above may not work." ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "Work Directory Set! You may need to re-login!" 
; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "Please Name Your robot" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "Working Directory and Profile Created!" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "Unable to Update config.php File!" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "Unable to Create Profile!" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "Work Directory is Invalid! Cannot Create Profile!" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "Work Directory is Invalid! Cannot Create Profile!" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "Problem Updating Database!" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "Profile Updated!" ; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "There was a Problem Updating Profile!" 
; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "Please Describe Your Robot" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "Search Auxiliary Links Displayed" ; configure_element.php line: 204 configure_element_cache_link = "Cache" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "Similar" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "Inlinks" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "IP address" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "Crawl Robot Set-up" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "Crawl Robot Name:" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "Robot Description" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "Submit" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "Back" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "Edit Crawl Options" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "Get Crawl Options From:" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "Web Crawl" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "Archive Crawl" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "Crawl Order:" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "Restrict Sites By Url:" ; -; crawloptions_element.php line: 92 +; 
crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "Allowed To Crawl Sites" ; -; crawloptions_element.php line: 97 +; crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "Disallowed Sites" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "Seed Sites" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "Crawl or Arc Folder to Re-index:" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "Meta Words" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "Word" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "URL Pattern" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "Word" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "URL Pattern" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "Save Options" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "Rel: %s " ; search_view.php line: 145 search_view_score = "Score %s" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "Cached" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "View as text" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "Similar" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "Inlinks" ; ; settings_view.php line: 76 diff --git a/locale/en-US/statistics.txt b/locale/en-US/statistics.txt index 5a165df53..b6bef56f0 100755 --- a/locale/en-US/statistics.txt +++ b/locale/en-US/statistics.txt @@ -1 +1 @@ -d:100; \ No newline at end of file +d:99; \ No 
newline at end of file diff --git a/locale/es/configure.ini b/locale/es/configure.ini index ae51e0557..68c631b15 100755 --- a/locale/es/configure.ini +++ b/locale/es/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "" ; -; admin_controller.php line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 
admin_controller_localename_deleted = "" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "" 
; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "" ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "" ; -; crawloptions_element.php line: 97 +; 
crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "" ; search_view.php line: 145 search_view_score = "" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; settings_view.php line: 76 diff --git a/locale/fr-FR/configure.ini b/locale/fr-FR/configure.ini index 8595e2036..d94384292 100755 --- a/locale/fr-FR/configure.ini +++ b/locale/fr-FR/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "" ; -; admin_controller.php 
line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 admin_controller_localename_deleted = "" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php 
line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "" ; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "" ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; 
configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "" ; -; crawloptions_element.php line: 97 +; crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; 
crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "Pertinence: %s" ; search_view.php line: 145 search_view_score = "Total: %s" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "En Cache" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "Version texte" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "Pages similaires" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; settings_view.php line: 76 diff --git a/locale/he/configure.ini b/locale/he/configure.ini index ec49de587..ebbfe4d2d 100755 --- a/locale/he/configure.ini +++ b/locale/he/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "" ; -; admin_controller.php line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 
admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 admin_controller_localename_deleted = "" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 
admin_controller_configure_work_dir_set = "" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "" ; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "" ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "" ; -; configure_element.php line: 232 +; configure_element.php line: 233 
configure_element_robot_description = "" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "" ; -; crawloptions_element.php line: 97 +; crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "" ; ; editlocales_element.php line: 62 @@ -786,16 
+795,16 @@ search_view_relevancy = "" ; search_view.php line: 145 search_view_score = "" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; settings_view.php line: 76 diff --git a/locale/in-ID/configure.ini b/locale/in-ID/configure.ini index eaa76a04e..0a4a6af5f 100755 --- a/locale/in-ID/configure.ini +++ b/locale/in-ID/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "" ; -; admin_controller.php line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 
admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "Pilih name locale" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "Locale telah ditambah" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "Locale tidak ditemukan" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 admin_controller_localename_deleted = "Locale telah dihapus" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 
admin_controller_configure_no_create_profile = "" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "" ; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "" ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 
79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "" ; -; crawloptions_element.php line: 97 +; crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "" ; search_view.php line: 145 search_view_score = "" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; settings_view.php line: 76 diff --git 
a/locale/it/configure.ini b/locale/it/configure.ini index cda233be8..3e58e164b 100755 --- a/locale/it/configure.ini +++ b/locale/it/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "" ; -; admin_controller.php line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 admin_controller_localename_deleted = "" ; -; 
admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "" ; -; admin_controller.php line: 1546 +; 
admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "" ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "" ; -; crawloptions_element.php line: 97 +; crawloptions_element.php line: 110 
crawloptions_element_disallowed_to_crawl = "" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "" ; search_view.php line: 145 search_view_score = "" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; settings_view.php line: 76 diff --git a/locale/ja/configure.ini b/locale/ja/configure.ini index b09a7258c..fce75371f 100755 --- a/locale/ja/configure.ini +++ b/locale/ja/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "幅優先" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "ページの重要性" ; -; admin_controller.php line: 885 +; admin_controller.php 
line: 894 admin_controller_update_seed_info = "シッド情報の更新" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "指数のための検索設定する。" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "選択ローケル" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "ローケルが追加しました" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "ローケルは存在しません" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 admin_controller_localename_deleted = "ローケルを削除しました" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "ローケルストリングを編集しました" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; 
admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "作業ディレクトリの設定しました。もう一度ログインしてください。" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "ボット名を入力してください。" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "作業ディレクトリとプロフィールの作成しました。" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "config.phpファイルのできない。更新" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "プロフィールを作成できない。" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "無効な作業ディレクト。プロフィールを作成できない。" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "無効な作業ディレクト。プロフィールを作成できない。" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "ディータベースの更新ない" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "プロフィールの変更できました。" ; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "プロフィールの変更できない。" ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "ロボットの説明してください。" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ 
configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "検索ロボット設定" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "ロボット名" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "ロボット説明" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "サブミット" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "戻る" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "検索オプションの編集" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "検索の順序" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "URLで制限" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "検索ができます" ; -; crawloptions_element.php line: 97 +; crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "検索はできません" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "シッドサイト" ; -; crawloptions_element.php line: 
108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "保存オプション" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "関連:%s" ; search_view.php line: 145 search_view_score = "スコア %s" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "キャッシューしました。" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "テクストビュー" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "同じビュー" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; settings_view.php line: 76 diff --git a/locale/ja/statistics.txt b/locale/ja/statistics.txt index 2c43a0adb..d1d9ed5ab 100755 --- a/locale/ja/statistics.txt +++ b/locale/ja/statistics.txt @@ -1 +1 @@ -d:73; \ No newline at end of file +d:72; \ No newline at end of file diff --git a/locale/ko/configure.ini b/locale/ko/configure.ini index 3e355fd94..26375e4ef 100755 --- a/locale/ko/configure.ini +++ b/locale/ko/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "너비 우선" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 
admin_controller_page_importance = "페이지 중요성" ; -; admin_controller.php line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "씨드 사이트 업데이트" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "크롤을 인덱스로써 사용하기 지정" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "로케일을 선택하여 주십시요." ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "로케일 추가!!" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "로케일이 존재하지 않습니다." ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 admin_controller_localename_deleted = "로케일을 삭제 하였습니다." ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "로케일 지정 문자열을 업데이트 하였습니다." 
; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "작업 디렉토리가 지정 됐습니다. 다시 로그인이 필요할수 있습니다." ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "로봇 이름을 정해 주십시요." ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "작업 디렉토리와 프로필이 생성됐습니다." ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "config.php 파일을 업데이트 실패했습니다." ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "프로필을 생성할수 없습니다." ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "작업 디렉토리가 올바르지 않습니다. 프로필을 생성할수 없습니다." ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "작업 디렉토리가 올바르지 않습니다. 프로필을 생성할수 없습니다." ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "데이터베이스를 업데이트하는데 문제가 발생했습니다." 
; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "프로필을 업데이트 했습니다." ; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "프로필을 업데이트하는데 문제가 발생했습니다." ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "당신의 로봇을 기술해 주십시요." ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "크롤 로봇 설정" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "로봇 기술 " ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "크롤 로봇 이름:" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "제출 " ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "뒤" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "크롤 옵션들 편집" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "크롤 순서:" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 
crawloptions_element_restrict_by_url = "사이트들을 주소로 제한:" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "크롤을 허가한 사이트들" ; -; crawloptions_element.php line: 97 +; crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "허가 하지않은 사이트들" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "씨드 사이트들" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "옵션들 저장하기" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "관련성: %s " ; search_view.php line: 145 search_view_score = "점수 %s" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "캐시 됀것" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "일반 텍스트로써 보기" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "유사성" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "인링크" ; ; settings_view.php line: 76 diff --git a/locale/ko/statistics.txt b/locale/ko/statistics.txt index 23fc7ec4f..77bbfe053 100755 --- a/locale/ko/statistics.txt +++ b/locale/ko/statistics.txt @@ -1 +1 @@ -d:75; \ No newline at end of file +d:74; \ No newline at end of file 
diff --git a/locale/pl/configure.ini b/locale/pl/configure.ini index e3a01fb2f..2d54ef5d6 100755 --- a/locale/pl/configure.ini +++ b/locale/pl/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "" ; -; admin_controller.php line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 
admin_controller_localename_deleted = "" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "" 
; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "" ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "" ; -; crawloptions_element.php line: 97 +; 
crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "" ; search_view.php line: 145 search_view_score = "" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; settings_view.php line: 76 diff --git a/locale/pt/configure.ini b/locale/pt/configure.ini index a86ee5e49..358ef6893 100755 --- a/locale/pt/configure.ini +++ b/locale/pt/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "" ; -; admin_controller.php line: 885 
+; admin_controller.php line: 894 admin_controller_update_seed_info = "" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 admin_controller_localename_deleted = "" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 
admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "" ; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "" ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php 
line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "" ; -; crawloptions_element.php line: 97 +; crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 
134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "" ; search_view.php line: 145 search_view_score = "" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; settings_view.php line: 76 diff --git a/locale/ru/configure.ini b/locale/ru/configure.ini index d1a985623..63e9d821d 100755 --- a/locale/ru/configure.ini +++ b/locale/ru/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "" ; -; admin_controller.php line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 
editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 admin_controller_localename_deleted = "" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 
admin_controller_name_your_bot = "" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "" ; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "" ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "" ; -; configure_element.php line: 241 +; configure_element.php line: 242 
configure_element_submit = "" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "" ; -; crawloptions_element.php line: 97 +; crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "" ; search_view.php line: 145 search_view_score = "" ; -; search_view.php 
line: 158 +; search_view.php line: 159 search_view_cache = "" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; settings_view.php line: 76 diff --git a/locale/th/configure.ini b/locale/th/configure.ini index fc83e6e22..5183309b8 100755 --- a/locale/th/configure.ini +++ b/locale/th/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "" ; -; admin_controller.php line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 
admin_controller_select_localename = "" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 admin_controller_localename_deleted = "" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "" ; -; 
admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "" ; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "" ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; 
crawloptions_element.php line: 92 crawloptions_element_crawl_order = "" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "" ; -; crawloptions_element.php line: 97 +; crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "" ; search_view.php line: 145 search_view_score = "" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; settings_view.php line: 76 diff --git a/locale/vi-VN/configure.ini b/locale/vi-VN/configure.ini index a8262559b..3b153b11d 100755 --- a/locale/vi-VN/configure.ini +++ b/locale/vi-VN/configure.ini @@ -154,109 
+154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "Bề rộng đầu tiên" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "Trang quan trọng" ; -; admin_controller.php line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "Cập nhật thông tin trang web hạt giống" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "Thiết lập thu thập dữ liệu để sử dụng làm chỉ mục" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "Chọn miền địa phương" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "Miền địa phương thêm vào" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "Miền địa phương không tồn tại" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 
admin_controller_localename_deleted = "Xóa miền địa phương" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "Chuỗi Địa phương được cập nhật" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "Công việc thiết lập thư mục bị đông cứng (Bạn có thể cần phải đăng nhập)" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "Đặt tên cho rô bô của bạn" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "Thư mục làm việc và hồ sơ được tạo ra" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "Không thể cập nhật hồ sơ config.php" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "Không thể tạo hồ sơ" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "Công tác thư mục không hợp lệ" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 
admin_controller_configure_work_dir_invalid = "Công tác thư mục không hợp lệ" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "Vấn đề cập nhật cơ sở dữ liệu" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "Hồ sơ được cập nhật" ; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "Có sự trở ngaị về việc cập nhật hồ sơ " ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "Diễn tả rô bô của bạn" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "Mô tả rô-bốt" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "Trở lại" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; 
crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "" ; -; crawloptions_element.php line: 97 +; crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "Lưu những lựa chọn" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "Thích hợp:" ; search_view.php line: 145 search_view_score = "Điểm: %s" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "Trang gốc" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "Trang Web Bắng Chữ" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "Tương Tự" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; settings_view.php line: 76 diff --git 
a/locale/zh-CN/configure.ini b/locale/zh-CN/configure.ini index 709186d4a..73aaa3def 100755 --- a/locale/zh-CN/configure.ini +++ b/locale/zh-CN/configure.ini @@ -154,109 +154,109 @@ admin_controller_use_below = "" ; admin_controller.php line: 798 admin_controller_previous_crawl = "" ; -; admin_controller.php line: 820 +; admin_controller.php line: 829 admin_controller_breadth_first = "深度優先" ; -; admin_controller.php line: 822 +; admin_controller.php line: 831 admin_controller_page_importance = "網頁重要性" ; -; admin_controller.php line: 885 +; admin_controller.php line: 894 admin_controller_update_seed_info = "" ; -; admin_controller.php line: 974 +; admin_controller.php line: 983 admin_controller_select_crawl = "" ; -; admin_controller.php line: 998 +; admin_controller.php line: 1007 admin_controller_unnamed = "" ; -; admin_controller.php line: 1003 +; admin_controller.php line: 1012 admin_controller_mix_created = "" ; -; admin_controller.php line: 1012 +; admin_controller.php line: 1021 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1034 +; admin_controller.php line: 1043 editcrawl_view_delete = "" ; -; admin_controller.php line: 1079 +; admin_controller.php line: 1088 admin_controller_mix_saved = "" ; -; admin_controller.php line: 1085 +; admin_controller.php line: 1094 admin_controller_set_index = "" ; -; admin_controller.php line: 1095 +; admin_controller.php line: 1104 admin_controller_mix_doesnt_exists = "" ; -; admin_controller.php line: 1103 +; admin_controller.php line: 1112 admin_controller_mix_deleted = "" ; -; admin_controller.php line: 1139 +; admin_controller.php line: 1148 admin_controller_select_localename = "" ; -; admin_controller.php line: 1182 +; admin_controller.php line: 1191 admin_controller_locale_added = "" ; -; admin_controller.php line: 1189 +; admin_controller.php line: 1198 admin_controller_localename_doesnt_exists = "" ; -; admin_controller.php line: 1198 +; admin_controller.php line: 1207 
admin_controller_localename_deleted = "" ; -; admin_controller.php line: 1218 +; admin_controller.php line: 1227 admin_controller_localestrings_updated = "" ; -; admin_controller.php line: 1272 +; admin_controller.php line: 1281 admin_controller_no_write_config_php = "" ; -; admin_controller.php line: 1277 +; admin_controller.php line: 1286 admin_controller_no_write_work_dir = "" ; -; admin_controller.php line: 1282 +; admin_controller.php line: 1291 admin_controller_post_size_small = "" ; -; admin_controller.php line: 1288 +; admin_controller.php line: 1297 admin_controller_missing_required = "" ; -; admin_controller.php line: 1304 +; admin_controller.php line: 1313 admin_controller_missing_optional = "" ; -; admin_controller.php line: 1309 +; admin_controller.php line: 1318 admin_controller_check_passed = "" ; -; admin_controller.php line: 1314 +; admin_controller.php line: 1323 admin_controller_using_local_config = "" ; -; admin_controller.php line: 1384 +; admin_controller.php line: 1393 admin_controller_configure_work_dir_set = "" ; -; admin_controller.php line: 1396 +; admin_controller.php line: 1405 admin_controller_name_your_bot = "" ; -; admin_controller.php line: 1405 +; admin_controller.php line: 1414 admin_controller_configure_work_profile_made = "" ; -; admin_controller.php line: 1413 +; admin_controller.php line: 1422 admin_controller_configure_no_set_config = "" ; -; admin_controller.php line: 1424 +; admin_controller.php line: 1433 admin_controller_configure_no_create_profile = "" ; -; admin_controller.php line: 1433 +; admin_controller.php line: 1442 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1444 +; admin_controller.php line: 1453 admin_controller_configure_work_dir_invalid = "" ; -; admin_controller.php line: 1518 +; admin_controller.php line: 1528 admin_controller_configure_no_change_db = "" ; -; admin_controller.php line: 1532 +; admin_controller.php line: 1542 admin_controller_configure_profile_change = "" 
; -; admin_controller.php line: 1546 +; admin_controller.php line: 1556 admin_controller_configure_no_change_profile = "" ; -; admin_controller.php line: 1582 +; admin_controller.php line: 1592 admin_controller_describe_robot = "" ; ; search_controller.php line: 119 @@ -437,25 +437,25 @@ configure_element_search_results = "" ; configure_element.php line: 204 configure_element_cache_link = "" ; -; configure_element.php line: 209 +; configure_element.php line: 210 configure_element_similar_link = "" ; -; configure_element.php line: 214 +; configure_element.php line: 215 configure_element_in_link = "" ; -; configure_element.php line: 219 +; configure_element.php line: 220 configure_element_ip_link = "" ; -; configure_element.php line: 223 +; configure_element.php line: 224 configure_element_crawl_robot = "" ; -; configure_element.php line: 225 +; configure_element.php line: 226 configure_element_robot_name = "" ; -; configure_element.php line: 232 +; configure_element.php line: 233 configure_element_robot_description = "" ; -; configure_element.php line: 241 +; configure_element.php line: 242 configure_element_submit = "" ; ; crawloptions_element.php line: 62 @@ -464,40 +464,49 @@ crawloptions_element_back_to_manage = "" ; crawloptions_element.php line: 64 crawloptions_element_edit_crawl_options = "" ; -; crawloptions_element.php line: 74 +; crawloptions_element.php line: 75 crawloptions_element_load_options = "" ; -; crawloptions_element.php line: 79 +; crawloptions_element.php line: 83 +crawloptions_element_web_crawl = "" +; +; crawloptions_element.php line: 87 +crawloptions_element_archive_crawl = "" +; +; crawloptions_element.php line: 92 crawloptions_element_crawl_order = "" ; -; crawloptions_element.php line: 85 +; crawloptions_element.php line: 98 crawloptions_element_restrict_by_url = "" ; -; crawloptions_element.php line: 92 +; crawloptions_element.php line: 105 crawloptions_element_allowed_to_crawl = "" ; -; crawloptions_element.php line: 97 +; 
crawloptions_element.php line: 110 crawloptions_element_disallowed_to_crawl = "" ; -; crawloptions_element.php line: 103 +; crawloptions_element.php line: 116 crawloptions_element_seed_sites = "" ; -; crawloptions_element.php line: 108 +; crawloptions_element.php line: 123 +crawloptions_element_reindex_crawl = "" +; +; crawloptions_element.php line: 130 crawloptions_element_meta_words = "" ; -; crawloptions_element.php line: 110 +; crawloptions_element.php line: 132 crawloptions_element_word = "" ; -; crawloptions_element.php line: 112 +; crawloptions_element.php line: 134 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 118 +; crawloptions_element.php line: 140 crawloptions_element_word = "" ; -; crawloptions_element.php line: 124 +; crawloptions_element.php line: 146 crawloptions_element_url_pattern = "" ; -; crawloptions_element.php line: 143 +; crawloptions_element.php line: 165 crawloptions_element_save_options = "" ; ; editlocales_element.php line: 62 @@ -786,16 +795,16 @@ search_view_relevancy = "關聯度: %s 趴" ; search_view.php line: 145 search_view_score = "分數" ; -; search_view.php line: 158 +; search_view.php line: 159 search_view_cache = "" ; -; search_view.php line: 161 +; search_view.php line: 162 search_view_as_text = "" ; -; search_view.php line: 168 +; search_view.php line: 174 search_view_similar = "相似" ; -; search_view.php line: 173 +; search_view.php line: 184 search_view_inlink = "" ; ; settings_view.php line: 76 diff --git a/models/crawl_model.php b/models/crawl_model.php index 34243e3f4..059be96e9 100755 --- a/models/crawl_model.php +++ b/models/crawl_model.php @@ -175,13 +175,18 @@ class CrawlModel extends Model implements CrawlConstants /** * Gets a list of all index archives of crawls that have been conducted + * + * @param bool $return_arc_bundles whether index bundles used for indexing + * arc or other archive bundles should be included in the list + * @param bool $return_recrawls whether index archive bundles 
generated as + * a result of recrawling should be included in the result * * @return array Available IndexArchiveBundle directories and - * their meta information this meta information includes the time of the - * crawl, its description, the number of pages downloaded, and the number - * of partitions used in storing the inverted index + * their meta information this meta information includes the time of + * the crawl, its description, the number of pages downloaded, and the + * number of partitions used in storing the inverted index */ - function getCrawlList() + function getCrawlList($return_arc_bundles = false, $return_recrawls = false) { $list = array(); $dirs = glob(CRAWL_DIR.'/cache/*', GLOB_ONLYDIR); @@ -194,7 +199,23 @@ class CrawlModel extends Model implements CrawlConstants substr($pre_timestamp, strlen(self::index_data_base_name)); $info = IndexArchiveBundle::getArchiveInfo($dir); $index_info = unserialize($info['DESCRIPTION']); - $crawl['DESCRIPTION'] = $index_info['DESCRIPTION']; + $crawl['DESCRIPTION'] = ""; + if(!$return_arc_bundles && isset($index_info['ARCFILE'])) { + continue; + } else if ($return_arc_bundles + && isset($index_info['ARCFILE'])) { + $crawl['DESCRIPTION'] = "ARCFILE::"; + } + if(!$return_recrawls && + isset($index_info[self::CRAWL_TYPE]) && + $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL) { + continue; + } else if($return_recrawls && + isset($index_info[self::CRAWL_TYPE]) && + $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL) { + $crawl['DESCRIPTION'] = "RECRAWL::"; + } + $crawl['DESCRIPTION'] .= $index_info['DESCRIPTION']; $crawl['VISITED_URLS_COUNT'] = isset($info['VISITED_URLS_COUNT']) ? 
$info['VISITED_URLS_COUNT'] : 0; @@ -299,7 +320,7 @@ class CrawlModel extends Model implements CrawlConstants } /** - * Returns whether the supplied timestamp corresponds to a crawl mix + * Returns whether the supplied timestamp corresponds to a crawl mix * * @param string timestamp of the requested crawl mix * @return bool true if it does; false otherwise @@ -378,6 +399,12 @@ class CrawlModel extends Model implements CrawlConstants $index_info = unserialize($info['DESCRIPTION']); $seed_info['general']["restrict_sites_by_url"] = $index_info[self::RESTRICT_SITES_BY_URL]; + $seed_info['general']["crawl_type"] = + (isset($index_info[self::CRAWL_TYPE])) ? + $index_info[self::CRAWL_TYPE] : self::WEB_CRAWL; + $seed_info['general']["crawl_index"] = + (isset($index_info[self::CRAWL_INDEX])) ? + $index_info[self::CRAWL_INDEX] : ''; $seed_info['general']["crawl_order"] = $index_info[self::CRAWL_ORDER]; $site_types = array( @@ -431,6 +458,9 @@ class CrawlModel extends Model implements CrawlConstants */ function setSeedInfo($info) { + if(!isset($info['general']['crawl_index'])) { + $info['general']['crawl_index']='12345678'; + } $n = array(); $n[] = <<<EOT ; ***** BEGIN LICENSE BLOCK ***** @@ -458,6 +488,8 @@ class CrawlModel extends Model implements CrawlConstants EOT; $n[] = '[general]'; $n[] = "crawl_order = '".$info['general']['crawl_order']."';"; + $n[] = "crawl_type = '".$info['general']['crawl_type']."';"; + $n[] = "crawl_index = '".$info['general']['crawl_index']."';"; $bool_string = ($info['general']['restrict_sites_by_url']) ? 
"true" : "false"; $n[] = "restrict_sites_by_url = $bool_string;"; diff --git a/models/datasources/datasource_manager.php b/models/datasources/datasource_manager.php index ad028b83e..59083677b 100755 --- a/models/datasources/datasource_manager.php +++ b/models/datasources/datasource_manager.php @@ -260,4 +260,3 @@ abstract class DatasourceManager } ?> - diff --git a/models/phrase_model.php b/models/phrase_model.php index ea7b21360..cff0856b6 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -255,9 +255,10 @@ class PhraseModel extends Model { $phrase = " ".$phrase; $phrase_string = $phrase; - $meta_words = array('link\:', 'site\:', - 'filetype\:', 'info\:', '\-', - 'index\:', 'i\:', 'ip\:', 'weight\:', 'w\:', 'u\:'); + $meta_words = array('link\:', 'site\:', 'version\:', 'modified\:', + 'filetype\:', 'info\:', '\-', 'os\:', 'server\:', 'date\:', + 'index\:', 'i\:', 'ip\:', 'weight\:', 'w\:', 'u\:', + 'lang\:'); $index_name = $this->index_name; $weight = 1; $found_metas = array(); @@ -265,7 +266,8 @@ class PhraseModel extends Model foreach($meta_words as $meta_word) { $pattern = "/(\s)($meta_word(\S)+)/"; preg_match_all($pattern, $phrase, $matches); - if(in_array($meta_word, array('link\:', 'site\:', + if(in_array($meta_word, array('link\:', 'site\:', 'os\:', + 'server\:', 'version\:', 'modified\:', 'date\:', 'lang\:', 'filetype\:', 'ip\:', 'info\:', 'u\:') )) { $found_metas = array_merge($found_metas, $matches[2]); } else if($meta_word == '\-') { @@ -289,8 +291,8 @@ class PhraseModel extends Model $index_archive_name = self::index_data_base_name . 
$index_name; $index_archive = new IndexArchiveBundle( CRAWL_DIR.'/cache/'.$index_archive_name); - $punct = "\.|\,|\:|\;|\"|\'|\`|\[|\]|\{|\}|\(|\)|\!|\||\&"; - $phrase_string = mb_ereg_replace($punct, " ", $phrase_string); + + $phrase_string = mb_ereg_replace(PUNCT, " ", $phrase_string); $phrase_string = preg_replace("/(\s)+/", " ", $phrase_string); /* we search using the stemmed words, but we format snippets in the @@ -298,7 +300,8 @@ class PhraseModel extends Model */ $query_words = explode(" ", $phrase_string); //not stemmed $base_words = - array_keys(PhraseParser::extractPhrasesAndCount($phrase_string)); + array_keys(PhraseParser::extractPhrasesAndCount($phrase_string, + MAX_PHRASE_LEN, getLocaleTag())); //stemmed $words = array_merge($base_words, $found_metas); if(isset($words) && count($words) == 1) { @@ -372,7 +375,7 @@ class PhraseModel extends Model * pretty weak. For now we pick the $num many words which appear in the * fewest documents. * - * @param string $craw_item a page summary + * @param string $crawl_item a page summary * @param int $num number of key phrase to return * @return array an array of most selective key phrases */ @@ -454,7 +457,8 @@ class PhraseModel extends Model $query_iterator = $this->getQueryIterator($word_structs); $num_retrieved = 0; $pages = array(); - while(is_array($next_docs = $query_iterator->nextDocsWithWord()) && + while(is_object($query_iterator) && + is_array($next_docs = $query_iterator->nextDocsWithWord()) && $num_retrieved < $to_retrieve) { foreach($next_docs as $doc_key => $doc_info) { $summary = & $doc_info[CrawlConstants::SUMMARY]; diff --git a/views/elements/crawloptions_element.php b/views/elements/crawloptions_element.php index a4bab69d1..e93bf93d1 100644 --- a/views/elements/crawloptions_element.php +++ b/views/elements/crawloptions_element.php @@ -63,7 +63,6 @@ class CrawloptionsElement extends Element ><?php e(tl('crawloptions_element_back_to_manage'))?></a> </div> <h2><?php 
e(tl('crawloptions_element_edit_crawl_options'))?></h2> - <form id="crawloptionsForm" method="post" action=''> <input type="hidden" name="c" value="admin" /> <input type="hidden" name="YIOOP_TOKEN" value="<?php @@ -71,6 +70,20 @@ class CrawloptionsElement extends Element <input type="hidden" name="a" value="manageCrawls" /> <input type="hidden" name="arg" value="options" /> <input type="hidden" name="posted" value="posted" /> + <input type="hidden" id='crawl-type' name="crawl_type" value="<?php + e($data['crawl_type'])?>" /> + <ul class='tabmenu-list'> + <li><a href="javascript:switchTab('webcrawltab', 'archivetab');" + id='webcrawltabitem' + class="<?php e($data['web_crawl_active']); ?>"><?php + e(tl('crawloptions_element_web_crawl'))?></a></li> + <li><a href="javascript:switchTab('archivetab', 'webcrawltab');" + id='archivetabitem' + class="<?php e($data['archive_crawl_active']); ?>"><?php + e(tl('crawloptions_element_archive_crawl'))?></a></li> + </ul> + <div class='tabmenu-content'> + <div id='webcrawltab'> <div class="topmargin"><label for="load-options"><b><?php e(tl('crawloptions_element_load_options'))?></b></label><?php $this->view->optionsHelper->render("load-options", "load_option", @@ -105,6 +118,15 @@ class CrawloptionsElement extends Element <textarea class="talltextarea" name="seed_sites" ><?php e($data['seed_sites']); ?></textarea> + </div> + <div id='archivetab'> + <div class="topmargin"><label for="load-options"><b><?php + e(tl('crawloptions_element_reindex_crawl'))?></b></label><?php + $this->view->optionsHelper->render("crawl-indexes", "crawl_indexes", + $data['available_crawl_indexes'], $data['crawl_index']); + ?></div> + </div> + </div> <div class="topmargin"><b><?php e(tl('crawloptions_element_meta_words'))?></b></div> <table class="metawordstable"> @@ -145,7 +167,23 @@ class CrawloptionsElement extends Element ?></button></div> </form> </div> + <script type="text/javascript"> + + function switchTab(newtab, oldtab) + { + setDisplay(newtab, 
true); + setDisplay(oldtab, false); + ntab = elt(newtab+"item"); + ntab.className = 'active'; + otab = elt(oldtab+"item"); + otab.className = ''; + ctype = elt('crawl-type'); + ctype.value = (newtab == 'webcrawltab') + ? '<?php e(CrawlConstants::WEB_CRAWL); ?>' : + '<?php e(CrawlConstants::ARCHIVE_CRAWL); ?>'; + } + </script> <?php } }