diff --git a/bin/arc_tool.php b/bin/arc_tool.php index 1e913649e..bbcc5a564 100755 --- a/bin/arc_tool.php +++ b/bin/arc_tool.php @@ -814,7 +814,7 @@ class ArcTool implements CrawlConstants * @param string $archive_name name or path to what was supposed to be * an archive */ - function badFormatMessageAndExit($archive_name, + function badFormatMessageAndExit($archive_name, $allowed_archives = "web or index") { echo <<< EOD @@ -858,14 +858,14 @@ php arc_tool.php mergetiers bundle_name max_tier php arc_tool.php posting bundle_name generation offset or php arc_tool.php posting bundle_name generation offset num - /* returns info about the posting (num many postings) in bundle_name at + /* returns info about the posting (num many postings) in bundle_name at the given generation and offset */ php arc_tool.php reindex bundle_name // reindex the word dictionary in bundle_name php arc_tool.php show bundle_name start num - /* outputs items start through num from bundle_name or name of + /* outputs items start through num from bundle_name or name of non-Yioop archive crawl folder */ @@ -876,4 +876,4 @@ EOD; $arc_tool = new ArcTool(); $arc_tool->start(); -?> +?> \ No newline at end of file diff --git a/bin/fetcher.php b/bin/fetcher.php index 04a63907e..93a309f36 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -503,7 +503,7 @@ class Fetcher implements CrawlConstants $this->to_crawl = array(); } } else if ($this->crawl_type == self::ARCHIVE_CRAWL && - $this->arc_type != "WebArchiveBundle" && + $this->arc_type != "WebArchiveBundle" && $this->arc_type != "") { /* case(2) */ // An archive crawl with data coming from the name server. crawlLog("MAIN LOOP CASE 2 -- ARCHIVE SCHEDULER (NOT RECRAWL)"); @@ -605,7 +605,7 @@ class Fetcher implements CrawlConstants crawlLog("Number of summarized pages ". count($summarized_site_pages)); - $force_send = (isset($info[self::END_ITERATOR]) && + $force_send = (isset($info[self::END_ITERATOR]) && $info[self::END_ITERATOR]) ? true : false; $this->updateFoundSites($summarized_site_pages, $force_send); @@ -905,7 +905,7 @@ class Fetcher implements CrawlConstants } else { $update_num = SEEN_URLS_BEFORE_UPDATE_SCHEDULER; crawlLog("Fetch on crawl {$this->crawl_time} was not ". - "halted properly."); + "halted properly."); crawlLog(" Dumping $update_num from old fetch ". "to try to make a clean re-start."); $count = count($this->to_crawl); @@ -917,7 +917,7 @@ class Fetcher implements CrawlConstants } } } - if(general_is_a($this->arc_type."Iterator", + if(general_is_a($this->arc_type."Iterator", "TextArchiveBundleIterator")) { $result_dir = WORK_DIRECTORY . "/schedules/" . $prefix.self::fetch_archive_iterator.$this->crawl_time; @@ -1034,7 +1034,7 @@ class Fetcher implements CrawlConstants $this->selectCurrentServerAndUpdateIfNeeded(false); $chunk = false; - if(general_is_a($this->arc_type."Iterator", + if(general_is_a($this->arc_type."Iterator", "TextArchiveBundleIterator")) { $archive_iterator = $this->archive_iterator; $chunk = true; @@ -1043,10 +1043,10 @@ class Fetcher implements CrawlConstants TextArchiveBundleIterator::MAX_RECORD_SIZE; if($archive_iterator->buffer_fh && $archive_iterator->current_offset < $max_offset) { - crawlLog("Local Iterator Offset: ". + crawlLog("Local Iterator Offset: ". $archive_iterator->current_offset); crawlLog("Local Max Offset: ". $max_offset); - $info[self::ARC_DATA] = + $info[self::ARC_DATA] = $archive_iterator->nextPages(ARCHIVE_BATCH_SIZE); crawlLog("Time to get archive data from local buffer ". changeInMicrotime($start_time)); @@ -1097,8 +1097,8 @@ class Fetcher implements CrawlConstants if($pages[self::ARC_DATA]) { $archive_iterator->makeBuffer($pages[self::ARC_DATA]); } - if(isset($pages[self::HEADER]) && - is_array($pages[self::HEADER]) && + if(isset($pages[self::HEADER]) && + is_array($pages[self::HEADER]) && $pages[self::HEADER] != array()) { $archive_iterator->header = $pages[self::HEADER]; } @@ -1189,7 +1189,7 @@ class Fetcher implements CrawlConstants 'crawl_order', self::CACHE_PAGES => 'cache_pages', self::INDEXED_FILE_TYPES => 'indexed_file_types', self::RESTRICT_SITES_BY_URL => 'restrict_sites_by_url', - self::ALLOWED_SITES => 'allowed_sites', + self::ALLOWED_SITES => 'allowed_sites', self::DISALLOWED_SITES => 'disallowed_sites'); foreach($update_fields as $info_field => $field) { if(isset($info[$info_field])) { @@ -1200,7 +1200,7 @@ class Fetcher implements CrawlConstants if(isset($info[self::PAGE_RULES]) ){ $rule_string = implode("\n", $info[self::PAGE_RULES]); $rule_string = html_entity_decode($rule_string, ENT_QUOTES); - $this->page_rule_parser = + $this->page_rule_parser = new PageRuleParser($rule_string); } if(isset($info[self::VIDEO_SOURCES])) { @@ -1609,7 +1609,7 @@ class Fetcher implements CrawlConstants * This method attempts to cull from the doc_info struct the * best MAX_LINKS_PER_PAGE. Currently, this is done by first removing * links which of filetype or sites the crawler is forbidden from crawl. - * Then a crude estimate of the informaation contained in the links test: + * Then a crude estimate of the informaation contained in the links test: * strlen(gzip(text)) is used to extract the best remaining links. * * @param array &$doc_info a string with a CrawlConstants::LINKS subarray @@ -1829,7 +1829,7 @@ class Fetcher implements CrawlConstants crawlLog($site_index.". $subdoc_info ".$site[self::URL]); } // end for - if($force_send || ($this->crawl_type == self::WEB_CRAWL && + if($force_send || ($this->crawl_type == self::WEB_CRAWL && count($this->to_crawl) <= 0 && count($this->to_crawl_again) <= 0) || (isset($this->found_sites[self::SEEN_URLS]) && count($this->found_sites[self::SEEN_URLS]) > @@ -2174,7 +2174,7 @@ class Fetcher implements CrawlConstants crawlLog("Trouble sending to the scheduler, response was:"); crawlLog("$info_string"); $info = unserialize($info_string); - if(isset($info[self::STATUS]) && + if(isset($info[self::STATUS]) && $info[self::STATUS] == self::REDO_STATE) { crawlLog("Server requested last item to be re-sent..."); if(isset($info[self::SUMMARY])) { @@ -2422,4 +2422,4 @@ $fetcher = new Fetcher($PAGE_PROCESSORS, NAME_SERVER, PAGE_RANGE_REQUEST, $INDEXED_FILE_TYPES); $fetcher->start(); -?> +?> \ No newline at end of file diff --git a/bin/news_updater.php b/bin/news_updater.php index bb25982df..593d28dc7 100644 --- a/bin/news_updater.php +++ b/bin/news_updater.php @@ -88,6 +88,20 @@ require_once BASE_DIR."/controllers/search_controller.php"; mb_internal_encoding("UTF-8"); mb_regex_encoding("UTF-8"); +if (function_exists('lcfirst') === false) { + /** + * Lower cases the first letter in a string + * + * This function is only defined if the PHP version is before 5.3 + * @param string $str string to be lower cased + * @return string the lower cased string + */ + function lcfirst( $str ) + { + return (string)(strtolower(substr($str, 0, 1)).substr($str, 1)); + } +} + /** * Separate process/command-line script which can be used to update * news sources for Yioop. This is as an alternative to using the web app @@ -111,8 +125,8 @@ class NewsUpdater implements CrawlConstants } /** - * This is the function that should be called to get the newsupdater to - * start to start updating. Calls init to handle the command-line + * This is the function that should be called to get the newsupdater to + * start to start updating. Calls init to handle the command-line * arguments then enters news_updaters main loop */ function start() @@ -169,4 +183,4 @@ class NewsUpdater implements CrawlConstants $news_updater = new NewsUpdater(); $news_updater->start(); -?> +?> \ No newline at end of file diff --git a/configs/config.php b/configs/config.php index c060656c4..5ee27d755 100644 --- a/configs/config.php +++ b/configs/config.php @@ -79,7 +79,7 @@ if(MAINTENANCE_MODE && $_SERVER["SERVER_ADDR"] != $_SERVER["REMOTE_ADDR"]) { } if(!defined('WORK_DIRECTORY')) { -/*+++ The next block of code is machine edited, change at +/*+++ The next block of code is machine edited, change at your own risk, please use configure web page instead +++*/ define('WORK_DIRECTORY', ''); /*++++++*/ @@ -504,4 +504,4 @@ define ('NUM_RESULTS_PER_PAGE', 10); /** Number of recently crawled urls to display on admin screen */ define ('NUM_RECENT_URLS_TO_DISPLAY', 10); -?> +?> \ No newline at end of file diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php index 2e4935c8e..9d4e429d4 100755 --- a/controllers/admin_controller.php +++ b/controllers/admin_controller.php @@ -336,7 +336,7 @@ class AdminController extends Controller implements CrawlConstants "news_process" => tl('admin_controller_news_process'), ); $profile = $this->profileModel->getProfile(WORK_DIRECTORY); - $data['NEWS_MODE'] = isset($profile['NEWS_MODE']) ? + $data['NEWS_MODE'] = isset($profile['NEWS_MODE']) ? $profile['NEWS_MODE']: ""; return $data; } @@ -928,7 +928,7 @@ class AdminController extends Controller implements CrawlConstants $crawl_params[self::DISALLOWED_SITES] = isset($seed_info['disallowed_sites']['url']) ? $seed_info['disallowed_sites']['url'] : array(); - $crawl_params[self::PAGE_RULES] = + $crawl_params[self::PAGE_RULES] = isset($seed_info['page_rules']['rule']) ? $seed_info['page_rules']['rule'] : array(); @@ -1178,7 +1178,7 @@ class AdminController extends Controller implements CrawlConstants } /** * Cleans a string consisting of lines, typically of urls into an array of - * clean lines. This is used in handling data from the crawl options + * clean lines. This is used in handling data from the crawl options * text areas. * * @param string $str contains the url data @@ -1494,13 +1494,13 @@ class AdminController extends Controller implements CrawlConstants foreach($copy_options as $main_option => $sub_options) { foreach($sub_options as $sub_option) { if(isset($seed_loaded[$main_option][$sub_option])) { - $seed_info[$main_option][$sub_option] = + $seed_info[$main_option][$sub_option] = $seed_loaded[$main_option][$sub_option]; } } } if(isset($seed_loaded['page_rules'])) { - $seed_info['page_rules'] = + $seed_info['page_rules'] = $seed_loaded['page_rules']; } $update_flag = true; @@ -1570,7 +1570,7 @@ class AdminController extends Controller implements CrawlConstants if(!isset($_REQUEST['load_option'])) { $data = array_merge($data, $profile); } else { - + $this->updateProfileFields($data, $profile, array('IP_LINK','CACHE_LINK', 'SIMILAR_LINK', 'IN_LINK', 'SIGNIN_LINK', 'SUBSEARCH_LINK','WORD_SUGGEST')); @@ -1652,7 +1652,7 @@ class AdminController extends Controller implements CrawlConstants $data['SCRIPT'] .= "\nswitchTab('testoptionstab',". "'crawltimetab', 'searchtimetab');\n"; } - + $this->crawlModel->setSeedInfo($seed_info); if($change == true && $data['option_type'] != 'test_options') { $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >". @@ -1697,7 +1697,7 @@ class AdminController extends Controller implements CrawlConstants $site[self::OPERATING_SYSTEM] = "unknown"; $site[self::LANG] = 'en'; $site[self::JUST_METAS] = false; - if(isset($_REQUEST['page_type']) && + if(isset($_REQUEST['page_type']) && in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) { $site[self::TYPE] = $_REQUEST['page_type']; } @@ -1729,7 +1729,7 @@ class AdminController extends Controller implements CrawlConstants print_r($after_process, true), "string"), 75, "\n", true); $rule_string = implode("\n", $seed_info['page_rules']['rule']); $rule_string = html_entity_decode($rule_string, ENT_QUOTES); - $page_rule_parser = + $page_rule_parser = new PageRuleParser($rule_string); $page_rule_parser->executeRuleTrees($site); $after_process = array(); @@ -2073,9 +2073,9 @@ class AdminController extends Controller implements CrawlConstants $data["ELEMENT"] = "machinelogElement"; $filter= ""; if(isset($_REQUEST['f'])) { - $filter = + $filter = $this->clean($_REQUEST['f'], "string"); - } + } $data['filter'] = $filter; $data["REFRESH_LOG"] = "&time=". $data["time"]; $data["LOG_TYPE"] = ""; @@ -2097,7 +2097,7 @@ class AdminController extends Controller implements CrawlConstants } $data["LOG_FILE_DATA"] = $this->machineModel->getLog( $r["name"], NULL, $filter); - $data["REFRESH_LOG"] .= + $data["REFRESH_LOG"] .= "&arg=log&name=".$r['name']; } if($data["time"] >= 1200) { @@ -2581,9 +2581,9 @@ class AdminController extends Controller implements CrawlConstants $data['SCRIPT'] = ""; $data['PROFILE'] = false; - if(isset($_REQUEST['WORK_DIRECTORY']) || (defined('WORK_DIRECTORY') && + if(isset($_REQUEST['WORK_DIRECTORY']) || (defined('WORK_DIRECTORY') && defined('FIX_NAME_SERVER') && FIX_NAME_SERVER) ) { - if(defined('WORK_DIRECTORY') && defined('FIX_NAME_SERVER') + if(defined('WORK_DIRECTORY') && defined('FIX_NAME_SERVER') && FIX_NAME_SERVER && !isset($_REQUEST['WORK_DIRECTORY'])) { $_REQUEST['WORK_DIRECTORY'] = WORK_DIRECTORY; $_REQUEST['arg'] = "directory"; @@ -2668,7 +2668,7 @@ class AdminController extends Controller implements CrawlConstants $uri = UrlParser::getPath($_SERVER['REQUEST_URI']); $http = (isset($_SERVER['HTTPS'])) ? "https://" : "http://"; - $profile['NAME_SERVER'] = + $profile['NAME_SERVER'] = $http . $_SERVER['SERVER_NAME'] . $uri; $data['NAME_SERVER'] = $profile['NAME_SERVER']; $profile['AUTH_KEY'] = crawlHash( @@ -2680,7 +2680,7 @@ class AdminController extends Controller implements CrawlConstants $data['ROBOT_INSTANCE'] = $profile['ROBOT_INSTANCE']; if($this->profileModel->updateProfile( $data['WORK_DIRECTORY'], array(), $profile)) { - if((defined('WORK_DIRECTORY') && + if((defined('WORK_DIRECTORY') && $data['WORK_DIRECTORY'] == WORK_DIRECTORY) || $this->profileModel->setWorkDirectoryConfigFile( $data['WORK_DIRECTORY'])) { @@ -2743,7 +2743,7 @@ class AdminController extends Controller implements CrawlConstants } break; case "profile": - $this->updateProfileFields($data, $profile, + $this->updateProfileFields($data, $profile, array('USE_FILECACHE', 'USE_MEMCACHE', "WEB_ACCESS", 'RSS_ACCESS', 'API_ACCESS')); $data['DEBUG_LEVEL'] = 0; @@ -2873,7 +2873,7 @@ class AdminController extends Controller implements CrawlConstants setDisplay('advance-robot', {$data['advanced']}); function toggleAdvance() { var advanced = elt('a-settings'); - advanced.value = (advanced.value =='true') + advanced.value = (advanced.value =='true') ? 'false' : 'true'; var value = (advanced.value == 'true') ? true : false; setDisplay('advance-configure', value); @@ -2894,7 +2894,7 @@ EOD; $data['SCRIPT'] .= "elt('locale').onchange = ". "function () { elt('configureProfileForm').submit();};\n"; - + return $data; } @@ -2931,4 +2931,4 @@ EOD; } } } -?> +?> \ No newline at end of file diff --git a/controllers/fetch_controller.php b/controllers/fetch_controller.php index 2bc8a8003..af45da888 100755 --- a/controllers/fetch_controller.php +++ b/controllers/fetch_controller.php @@ -238,7 +238,7 @@ class FetchController extends Controller implements CrawlConstants } $pages = false; if($archive_iterator && !$archive_iterator->end_of_iterator) { - if(general_is_a($archive_iterator, + if(general_is_a($archive_iterator, "TextArchiveBundleIterator")) { $pages = $archive_iterator->nextChunk(); $chunk = true; @@ -576,4 +576,4 @@ class FetchController extends Controller implements CrawlConstants return $list; } } -?> +?> \ No newline at end of file diff --git a/controllers/search_controller.php b/controllers/search_controller.php index aa8d0a5f3..54820450d 100755 --- a/controllers/search_controller.php +++ b/controllers/search_controller.php @@ -113,12 +113,12 @@ class SearchController extends Controller implements CrawlConstants if(!$format_info) { return;} list($view, $web_flag, $raw, $results_per_page, $limit) = $format_info; - list($query, $activity, $arg) = + list($query, $activity, $arg) = $this->initializeUserAndDefaultActivity($data); if($activity == "query" && $this->mirrorHandle()) {return; } - list($index_timestamp, $index_info, $save_timestamp) = + list($index_timestamp, $index_info, $save_timestamp) = $this->initializeIndexInfo($web_flag, $raw, $data); if(isset($_REQUEST['q']) && strlen($_REQUEST['q']) > 0 @@ -161,7 +161,7 @@ class SearchController extends Controller implements CrawlConstants exit(); } - if($web_flag) { + if($web_flag) { $this->addSearchViewData($index_info, $no_query, $raw, $view, $subsearches, $data); } @@ -178,7 +178,7 @@ class SearchController extends Controller implements CrawlConstants /** * Determines how this query is being run and return variables for the view * - * A query might be run as a web-based where HTML is expected as the + * A query might be run as a web-based where HTML is expected as the * output, an RSS query, an API query, or as a serial query from a * name_server or mirror instance back to one of the other queue servers * in a Yioop installation. A query might also request different numbers @@ -288,7 +288,7 @@ class SearchController extends Controller implements CrawlConstants function initializeUserAndDefaultActivity(&$data) { $arg = false; - if(!isset($_REQUEST['a']) || !in_array($_REQUEST['a'], + if(!isset($_REQUEST['a']) || !in_array($_REQUEST['a'], $this->activities)) { $activity = "query"; } else { @@ -345,12 +345,12 @@ class SearchController extends Controller implements CrawlConstants /** * Determines which crawl or mix timestamp should be in use for this - * query. It also determines info and returns associated with this + * query. It also determines info and returns associated with this * timestamp. * * @param bool $web_flag whether this is a web based query or one from * the search API - * @param int and so should validate against list of known crawls or an + * @param int and so should validate against list of known crawls or an * internal (say network) query that doesn't require validation * (faster without). * @param array &$data that will eventually be sent to the view. We set @@ -827,7 +827,7 @@ class SearchController extends Controller implements CrawlConstants } $time = time(); $rss_feeds = $this->sourceModel->getMediaSources("rss"); - if(!$rss_feeds || count($rss_feeds) == 0) { + if(!$rss_feeds || count($rss_feeds) == 0) { $data["LOG_MESSAGES"] = "No news update as no news feeds."; return; @@ -864,8 +864,8 @@ class SearchController extends Controller implements CrawlConstants /* every 3 hours everything older than a week and rebuild index do this every four hours so news articles tend to stay in order */ - if($delta > 3 * SourceModel::ONE_HOUR && - $start_delta > SourceModel::ONE_HOUR/12 && + if($delta > 3 * SourceModel::ONE_HOUR && + $start_delta > SourceModel::ONE_HOUR/12 && $lock_delta > SourceModel::TWO_MINUTES) { $this->cronModel->updateCronTime("news_lock"); $this->cronModel->updateCronTime("news_start_delete", true); @@ -881,10 +881,10 @@ class SearchController extends Controller implements CrawlConstants } $update_cron_time = $this->cronModel->getCronTime("news_update"); $try_cron_time = $this->cronModel->getCronTime("news_try_again"); - + $delta = $time - max($update_cron_time, $try_cron_time); // each 15 minutes try to re-get feeds that have no items - if((($delta > SourceModel::ONE_HOUR/4 && + if((($delta > SourceModel::ONE_HOUR/4 && $delta < SourceModel::ONE_HOUR) || $delta == 0) && $lock_delta > SourceModel::TWO_MINUTES) { $this->cronModel->updateCronTime("news_lock"); @@ -895,7 +895,7 @@ class SearchController extends Controller implements CrawlConstants $this->cronModel->saveCronTable(); return; } - + $delta = $time - $update_cron_time; // every hour get items from twenty feeds whose newest items are oldest if(($delta > SourceModel::ONE_HOUR || $delta == 0) @@ -1233,7 +1233,7 @@ class SearchController extends Controller implements CrawlConstants * * @param string $url to get cached page for * @param array $ui_flags array of ui features which - * should be added to the cache page. For example, "highlight" + * should be added to the cache page. For example, "highlight" * would way search terms should be highlighted, "history" * says add history navigation for all copies of this cache page in * yioop system. @@ -1259,7 +1259,7 @@ class SearchController extends Controller implements CrawlConstants * * @param string $url the url of the page to find the cached version of * @param array $ui_flags array of ui features which - * should be added to the cache page. For example, "highlight" + * should be added to the cache page. For example, "highlight" * would say search terms should be highlighted, "history" * says add history navigation for all copies of this cache page in * yioop system. "summaries" says add a toggle headers and extracted @@ -1367,7 +1367,7 @@ class SearchController extends Controller implements CrawlConstants $this->crawlModel->index_name = $crawl_time; $crawl_item = $this->crawlModel->getCrawlItem($url, $queue_servers); // A crawl item is able to override the default UI_FLAGS - if(isset($crawl_item[self::UI_FLAGS]) && + if(isset($crawl_item[self::UI_FLAGS]) && is_string($crawl_item[self::UI_FLAGS])) { $ui_flags = explode(",", $crawl_item[self::UI_FLAGS]); } @@ -1536,7 +1536,7 @@ class SearchController extends Controller implements CrawlConstants * in Yioop system * @param string $terms from orginal query responsible for cache request * @param array $ui_flags array of ui features which - * should be added to the cache page. For example, "highlight" + * should be added to the cache page. For example, "highlight" * would way search terms should be highlighted, "history" * says add history navigation for all copies of this cache page in * yioop system. @@ -1616,9 +1616,9 @@ class SearchController extends Controller implements CrawlConstants } else { $summary_toggle_node = $first_child; } - if(isset($cache_item[self::KEYWORD_LINKS]) && + if(isset($cache_item[self::KEYWORD_LINKS]) && count($cache_item[self::KEYWORD_LINKS]) > 0) { - $keyword_node = $this->createDomBoxNode($dom, $text_align, + $keyword_node = $this->createDomBoxNode($dom, $text_align, "zIndex: 1"); $text_node = $dom->createTextNode("Z@key_links@Z"); $keyword_node->appendChild($text_node); @@ -1631,7 +1631,7 @@ class SearchController extends Controller implements CrawlConstants } if(in_array("version", $ui_flags)) { - $version_node = + $version_node = $this->createDomBoxNode($dom, $text_align, "zIndex: 1"); $textNode = $dom->createTextNode( tl('search_controller_cached_version', "Z@url@Z", $date)); @@ -1646,7 +1646,7 @@ class SearchController extends Controller implements CrawlConstants //UI for showing history if(in_array("history", $ui_flags)) { - $history_node = $this->historyUI($crawl_time, $all_crawl_times, + $history_node = $this->historyUI($crawl_time, $all_crawl_times, $version_node, $dom, $terms, $hist_ui_open, $url); } else { $history_node = $dom->createElement('div'); @@ -1714,7 +1714,7 @@ class SearchController extends Controller implements CrawlConstants } /** - * Creates the toggle link and hidden div for extracted header and + * Creates the toggle link and hidden div for extracted header and * summary element on cache pages * * @param DOMDocument $dom used to create new nodes to add to body object @@ -1729,13 +1729,13 @@ class SearchController extends Controller implements CrawlConstants $summary_string, $cache_item) { $first_child = $body->firstChild; - $summaryNode = $this->createDomBoxNode($dom, $text_align, + $summaryNode = $this->createDomBoxNode($dom, $text_align, "display:none;", 'pre'); $summaryNode->setAttributeNS("","id", "summary-page-id"); $summaryNode = $body->insertBefore($summaryNode, $first_child); if(isset($cache_item[self::HEADER])) { - $summary_string = $cache_item[self::HEADER]."\n". + $summary_string = $cache_item[self::HEADER]."\n". $summary_string; } $textNode = $dom->createTextNode($summary_string); @@ -1766,7 +1766,7 @@ class SearchController extends Controller implements CrawlConstants } /** - * Creates a bordered tag (usually div) in which to put meta content on a + * Creates a bordered tag (usually div) in which to put meta content on a * page when it is displayed * * @param DOMDocument $dom representing cache page @@ -2110,4 +2110,4 @@ class SearchController extends Controller implements CrawlConstants $node->appendChild($script); } } -?> +?> \ No newline at end of file diff --git a/examples/search_api.php b/examples/search_api.php index fdae973b8..14cdd8113 100644 --- a/examples/search_api.php +++ b/examples/search_api.php @@ -64,20 +64,27 @@ if(!PROFILE) { * but a crawl into the WORK_DIRECTORY and that would be used to make the * query. */ -if(!file_exists(BASE_DIR."/examples/Archive1317414322.zip") || - !file_exists(BASE_DIR."/examples/IndexData1317414322.zip")) { +$archive = BASE_DIR."/examples/Archive1317414322.zip"; +$index_archive = BASE_DIR."/examples/IndexData1317414322.zip"; +$extract_folder = CRAWL_DIR."/cache"; +if(!file_exists($archive) || + !file_exists($index_archive)) { echo "\nSearch API test index doesn't exist, so can't run demo\n\n"; exit(); } -$zip = new ZipArchive(); -$zipH = $zip->open("Archive1317414322.zip"); -$zip->extractTo(CRAWL_DIR."/cache"); -$zip->close(); -$zipH = $zip->open("IndexData1317414322.zip"); -$zip->extractTo(CRAWL_DIR."/cache"); -$zip->close(); - +if(class_exists("ZipArchive")) { + $zip = new ZipArchive(); + $zipH = $zip->open($archive); + $zip->extractTo($extract_folder); + $zip->close(); + $zipH = $zip->open($index_archive); + $zip->extractTo($extract_folder); + $zip->close(); +} else { + exec("unzip $archive -d $extract_folder"); + exec("unzip $index_archive -d $extract_folder"); +} /** * The next block of code till +++++ is needed only if you want diff --git a/index.php b/index.php index 269fc785d..71b2133d0 100755 --- a/index.php +++ b/index.php @@ -120,7 +120,9 @@ if (function_exists('lcfirst') === false) { * @return string the lower cased string */ function lcfirst( $str ) - { return (string)(strtolower(substr($str,0,1)).substr($str,1));} + { + return (string)(strtolower(substr($str, 0, 1)).substr($str, 1)); + } } $available_controllers = array( "admin", "archive", "cache", "crawl", diff --git a/lib/archive_bundle_iterators/database_bundle_iterator.php b/lib/archive_bundle_iterators/database_bundle_iterator.php index d3fefebe7..b3ea0ee55 100644 --- a/lib/archive_bundle_iterators/database_bundle_iterator.php +++ b/lib/archive_bundle_iterators/database_bundle_iterator.php @@ -121,7 +121,7 @@ class DatabaseBundleIterator extends ArchiveBundleIterator $ini = parse_ini_file("{$this->iterate_dir}/arc_description.ini"); $this->dbinfo = array("DBMS" => DBMS, "DB_HOST" => DB_HOST, - "DB_NAME" => DB_NAME, "DB_USER" => DB_USER, + "DB_NAME" => DB_NAME, "DB_USER" => DB_USER, "DB_PASSWORD" => DB_PASSWORD); foreach($this->dbinfo as $key => $value) { @@ -273,7 +273,7 @@ class DatabaseBundleIterator extends ArchiveBundleIterator /** * Restores the internal state from the file iterate_status.txt in the * result dir such that the next call to nextPages will pick up from just - * after the last checkpoint. + * after the last checkpoint. * * @return array the data serialized when saveCheckpoint was called @@ -287,4 +287,4 @@ class DatabaseBundleIterator extends ArchiveBundleIterator return $info; } } -?> +?> \ No newline at end of file diff --git a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php index 635af5193..910b92ed2 100644 --- a/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php +++ b/lib/archive_bundle_iterators/mediawiki_bundle_iterator.php @@ -223,7 +223,7 @@ class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator array("/{{Redirect2?\|([^{}\|]+)\|([^{}\|]+)\|([^{}\|]+)}}/i", "<div class='indent'>\"$1\". ($2 →<a href=\"". $base_address."$3\">$3</a>)</div>"), - array("/{{Redirect\|([^{}\|]+)}}/i", + array("/{{Redirect\|([^{}\|]+)}}/i", "<div class='indent'>\"$1\". (<a href=\"". $base_address. "$1_(disambiguation)\">$1???</a>)</div>"), array("/#REDIRECT:\s+\[\[(.+?)\]\]/", @@ -234,7 +234,7 @@ class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator "<a href=\"{$base_address}$1\">$1</a>"), array("/\[(http[^\s\]]+)\s+(.+?)\]/s", "[<a href=\"$1\">$2</a>]"), - array("/\[(http[^\]\s]+)\s*\]/","(<a href=\"$1\">→</a>)"), + array("/\[(http[^\]\s]+)\s*\]/","(<a href=\"$1\">→</a>)"), array("/'''''(.+?)'''''/s", "<b><i>$1</i></b>"), array("/'''(.+?)'''/s", "<b>$1</b>"), array("/''(.+?)''/s", "<i>$1</i>"), @@ -380,7 +380,7 @@ class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator $pre_page = preg_replace_callback('/(\A|\n){\|(.*?)\n\|}/s', "makeTableCallback", $pre_page); $pre_page = preg_replace($this->matches, $this->replaces,$pre_page); - $pre_page = preg_replace("/{{Other uses}}/i", + $pre_page = preg_replace("/{{Other uses}}/i", "<div class='indent'>\"$1\". (<a href='". $site[self::URL]. "_(disambiguation)'>$pre_url</a>)</div>", $pre_page); @@ -459,7 +459,7 @@ class MediaWikiArchiveBundleIterator extends TextArchiveBundleIterator if(in_array($field, $wiki_fields)) { $value = preg_replace($this->matches, $this->replaces, $value); - $value = strip_tags($value, + $value = strip_tags($value, '<a><b><i><span><img>'); } $ref_data[$field] = $value; @@ -685,4 +685,4 @@ function fixLinksCallback($matches) return $out; } -?> +?> \ No newline at end of file diff --git a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php index a7d749572..405231a81 100644 --- a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php +++ b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php @@ -166,7 +166,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator * How many bytes at a time should be read from the current archive * file into the buffer file. 8192 = BZip2BlockIteraror::BlOCK_SIZE */ - const BUFFER_SIZE = 16384000; + const BUFFER_SIZE = 16384000; /** * Estimate of the maximum size of a record stored in a text archive @@ -218,7 +218,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator exit(); } if($this->iterate_dir != false) { - foreach(glob("{$this->iterate_dir}/*.$extension", GLOB_BRACE) + foreach(glob("{$this->iterate_dir}/*.$extension", GLOB_BRACE) as $filename) { $this->partitions[] = $filename; } @@ -405,7 +405,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator * @param bool $no_process if true then just return page string found * not any additional meta data. * @return mixed associative array for doc or just string of doc - * + * */ function nextPage($no_process = false) { @@ -414,7 +414,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator while((preg_match($this->delimiter, $this->buffer, $matches, PREG_OFFSET_CAPTURE)) != 1) { $block = $this->getFileBlock(); - if(!$block || + if(!$block || !$this->checkFileHandle() || $this->checkEof()) { return NULL; } @@ -471,7 +471,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator } /** - * Acts as gzread($num_bytes, $archive_file), hiding the fact that + * Acts as gzread($num_bytes, $archive_file), hiding the fact that * buffering of the archive_file is being done to a buffer file * * @param int $num_bytes to read from archive file @@ -489,7 +489,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator } /** - * Acts as gzgets(), hiding the fact that + * Acts as gzgets(), hiding the fact that * buffering of the archive_file is being done to a buffer file * * @return string from archive file up to next line ending or eof @@ -540,7 +540,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator if($this->compression == "plain") { $success = fseek($this->fh, $seek_pos); } - if($success == -1 || !$this->checkFileHandle() + if($success == -1 || !$this->checkFileHandle() || $this->checkEof()) { return false; } if(is_resource($this->buffer_fh)) { fclose($this->buffer_fh); @@ -551,7 +551,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator case 'bzip2': $buffer = ""; while(strlen($buffer) < $padded_buffer_size) { - while(!is_string($block = + while(!is_string($block = $this->bz2_iterator->nextBlock())) { if($this->bz2_iterator->eof()) { break 2; @@ -814,4 +814,4 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator return array($tag_info, $tag); } } -?> +?> \ No newline at end of file diff --git a/lib/archive_bundle_iterators/warc_archive_bundle_iterator.php b/lib/archive_bundle_iterators/warc_archive_bundle_iterator.php index f6fb46051..b27aba6ed 100644 --- a/lib/archive_bundle_iterators/warc_archive_bundle_iterator.php +++ b/lib/archive_bundle_iterators/warc_archive_bundle_iterator.php @@ -41,7 +41,7 @@ require_once BASE_DIR. /** * Used to iterate through the records of a collection of warc files stored in - * a WebArchiveBundle folder. Warc is the newer file format of the + * a WebArchiveBundle folder. Warc is the newer file format of the * Internet Archive and other for digital preservation: * http://www.digitalpreservation.gov/formats/fdd/fdd000236.shtml * http://archive-access.sourceforge.net/warc/ @@ -101,8 +101,8 @@ class WarcArchiveBundleIterator extends TextArchiveBundleIterator } while(!in_array($page_info['warc-type'], $indexable_records) || substr($page_info[self::URL], 0, 4) == 'dns:'); //ignore warcinfo, request, metadata, revisit, etc. records - if($no_process) { - return $header_and_page; + if($no_process) { + return $header_and_page; } unset($page_info['line']); unset($page_info['warc-type']); @@ -136,7 +136,7 @@ class WarcArchiveBundleIterator extends TextArchiveBundleIterator function getWarcHeaders() { $warc_headers = array(); - $warc_fields = array( 'warc-type' => 'warc-type', + $warc_fields = array( 'warc-type' => 'warc-type', 'warc-target-uri' => self::URL, 'warc-date' => self::TIMESTAMP, 'warc-ip-address' => self::IP_ADDRESSES, 'content-length' => self::SIZE, 'warc-record-id' => self::WARC_ID, @@ -166,4 +166,4 @@ class WarcArchiveBundleIterator extends TextArchiveBundleIterator return $warc_headers; } } -?> +?> \ No newline at end of file diff --git a/lib/crawl_daemon.php b/lib/crawl_daemon.php index d595ebb57..95c169414 100644 --- a/lib/crawl_daemon.php +++ b/lib/crawl_daemon.php @@ -153,7 +153,7 @@ class CrawlDaemon implements CrawlConstants for($i = 3; $i < count($argv); $i++) { $options .= " ".$argv[$i]; } - $subname = (!isset($argv[2]) || $argv[2] == 'none') ? + $subname = (!isset($argv[2]) || $argv[2] == 'none') ? 'none' :self::$subname; $name_prefix = (isset($argv[3])) ? $argv[3] : self::$subname; $name_string = CrawlDaemon::getNameString($name,$name_prefix); @@ -351,4 +351,4 @@ class CrawlDaemon implements CrawlConstants } } - ?> + ?> \ No newline at end of file diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php index def778f8b..f4b0e582b 100644 --- a/lib/index_bundle_iterators/group_iterator.php +++ b/lib/index_bundle_iterators/group_iterator.php @@ -524,7 +524,7 @@ class GroupIterator extends IndexBundleIterator $max = ($max < $current_rank ) ? $current_rank : $max; $alpha = $relevance_boost * $domain_weights[$hash_host]; $sum_score += $alpha * $hash_page[self::DOC_RANK]; - + $sum_rank += $alpha * $hash_page[self::DOC_RANK]; $sum_relevance += $alpha * $hash_page[self::RELEVANCE]; $max_proximity = max($max_proximity, @@ -599,4 +599,4 @@ class GroupIterator extends IndexBundleIterator } } -?> +?> \ No newline at end of file diff --git a/lib/index_bundle_iterators/network_iterator.php b/lib/index_bundle_iterators/network_iterator.php index 2686d8137..36ea11604 100644 --- a/lib/index_bundle_iterators/network_iterator.php +++ b/lib/index_bundle_iterators/network_iterator.php @@ -119,7 +119,7 @@ class NetworkIterator extends IndexBundleIterator * archive bundles that we look in for results * @param array $filter an array of hashes of domains to filter from * results - * @param string $save_timestamp if this timestamp is nonzero, then when + * @param string $save_timestamp if this timestamp is nonzero, then when * making queries to separate machines the save_timestamp is sent so * the queries on those machine can make savepoints. Note the * format of save_timestamp is timestamp-query_part where query_part @@ -317,7 +317,7 @@ class NetworkIterator extends IndexBundleIterator if(!isset($sites[$index])) { $sites[$index] = array(); } - $tmp = urlencode(print_r($sites[$index], + $tmp = urlencode(print_r($sites[$index], true)); $title = 'URL not set'; if(trim($tmp) == "") { @@ -353,4 +353,4 @@ class NetworkIterator extends IndexBundleIterator } } - ?> + ?> \ No newline at end of file diff --git a/lib/index_dictionary.php b/lib/index_dictionary.php index 70187feef..5b3191b15 100644 --- a/lib/index_dictionary.php +++ b/lib/index_dictionary.php @@ -726,7 +726,7 @@ class IndexDictionary implements CrawlConstants $ws = substr($word_string, $word_key_len); if($extract) { $tmp = IndexShard::getWordInfoFromString($ws, true); - if($tmp[3] < $max_entry_count && + if($tmp[3] < $max_entry_count && $previous_generation != $tmp[0]) { array_unshift($info, $tmp); $previous_generation = $tmp[0]; @@ -841,4 +841,4 @@ class IndexDictionary implements CrawlConstants } - ?> + ?> \ No newline at end of file diff --git a/lib/indexing_plugins/recipe_plugin.php b/lib/indexing_plugins/recipe_plugin.php index c93b800a5..fab9d02ba 100644 --- a/lib/indexing_plugins/recipe_plugin.php +++ b/lib/indexing_plugins/recipe_plugin.php @@ -116,7 +116,7 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants $xpath = new DOMXPath($dom); $recipes_per_page = $xpath->evaluate( - /*allr, f.com, brec, fnet*/ + /*allr, f.com, brec, fnet*/ "/html//ul[@class = 'ingredient-wrap'] | /html//*[@class = 'pod ingredients'] | /html//*[@id='recipe_title'] | @@ -127,7 +127,7 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants if(is_object($recipes_per_page) && $recipes_per_page->length != 0) { $recipes_count = $recipes_per_page->length; $titles = $xpath->evaluate( - /* allr, f.com, brec, fnet */ + /* allr, f.com, brec, fnet */ "/html//*[@id = 'itemTitle']| /html//h1[@class = 'fn'] | /html//*[@id='recipe_title'] | @@ -197,7 +197,7 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants while($more_docs) { $results = @$search_controller->queryRequest($query, $num, $limit, 1, $index_name); - if(isset($results["PAGES"]) && + if(isset($results["PAGES"]) && ($num_results = count($results["PAGES"])) > 0 ) { $raw_recipes = array_merge($raw_recipes, $results["PAGES"]); } @@ -675,9 +675,9 @@ class Tree } } } - $i++; + $i++; } - return $cluster; + return $cluster; } /** diff --git a/lib/page_rule_parser.php b/lib/page_rule_parser.php index bb038e1f6..589144be1 100644 --- a/lib/page_rule_parser.php +++ b/lib/page_rule_parser.php @@ -47,7 +47,7 @@ require_once BASE_DIR."/lib/crawl_constants.php"; * and does a function call to manipulate that page. Right now the supported * commands are to unset that field value, to add the field and field value to * the META_WORD array for the page and to split the field on comma, view this - * as a search keywords => link text association, and add this the + * as a search keywords => link text association, and add this the * KEYWORD_LINKS array. * These have the syntax: * unset(field) @@ -109,7 +109,7 @@ class PageRuleParser implements CrawlConstants $end = '(?:\n|\Z)'; $substitution = '(/[^/\n]+/)([^/\n]*)/'; $command = '(\w+)\((\w+)\)'; - $rule = + $rule = "@(?:$command$blank*($comment)?$end". "|$blank*($literal)$blank*($assignment)$blank*". "((".$quote_string.")|($literal)|($substitution))". @@ -117,7 +117,7 @@ class PageRuleParser implements CrawlConstants $matches = array(); preg_match_all($rule, $page_rules, $matches); $rule_trees = array(); - if(!isset($matches[0]) || + if(!isset($matches[0]) || ($num_rules = count($matches[0])) == 0) { return $rule_trees; } for($i = 0; $i < $num_rules; $i++) { $tree = array(); @@ -177,7 +177,7 @@ class PageRuleParser implements CrawlConstants function executeFunctionRule($tree, &$page_data) { $allowed_functions = array("unset" => "unsetVariable", - "addMetaWord" => "addMetaWord", + "addMetaWord" => "addMetaWord", "addKeywordLink" => "addKeywordLink"); if(in_array($tree['func_call'], array_keys($allowed_functions))) { $func = $allowed_functions[$tree['func_call']]; @@ -238,7 +238,7 @@ class PageRuleParser implements CrawlConstants } /** - * Unsets the key $field (or the crawl constant it corresponds to) + * Unsets the key $field (or the crawl constant it corresponds to) * in $page_data. If it is a crawlconstant it doesn't unset it -- * it just sets it to the empty string * @@ -304,4 +304,4 @@ class PageRuleParser implements CrawlConstants } } -?> +?> \ No newline at end of file diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php index a11b7c6eb..cc6d66ac9 100755 --- a/lib/phrase_parser.php +++ b/lib/phrase_parser.php @@ -474,7 +474,7 @@ class PhraseParser $meta_ids[] = 'info:'.crawlHash($site[CrawlConstants::URL]); $meta_ids[] = 'code:all'; $meta_ids[] = 'code:'.$site[CrawlConstants::HTTP_CODE]; - if(UrlParser::getHost($site[CrawlConstants::URL])."/" == + if(UrlParser::getHost($site[CrawlConstants::URL])."/" == $site[CrawlConstants::URL]) { $meta_ids[] = 'host:all'; //used to count number of distinct hosts } @@ -509,7 +509,7 @@ class PhraseParser $meta_ids[] = 'link:'.crawlHash($url); } } - if(isset($site[CrawlConstants::LOCATION]) && + if(isset($site[CrawlConstants::LOCATION]) && count($site[CrawlConstants::LOCATION]) > 0){ foreach($site[CrawlConstants::LOCATION] as $location) { $meta_ids[] = 'info:'.$location; @@ -528,7 +528,7 @@ class PhraseParser $meta_ids[] = 'media:all'; if($video_sources != array()) { - if(UrlParser::isVideoUrl($site[CrawlConstants::URL], + if(UrlParser::isVideoUrl($site[CrawlConstants::URL], $video_sources)) { $meta_ids[] = "media:video"; } else { @@ -696,4 +696,4 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け $score = $num_unsafe_terms * $unsafe_count/($len + 1); return $score; } -} +} \ No newline at end of file diff --git a/lib/processors/rss_processor.php b/lib/processors/rss_processor.php index 9fbb539fa..f850a90bc 100644 --- a/lib/processors/rss_processor.php +++ b/lib/processors/rss_processor.php @@ -234,7 +234,7 @@ class RssProcessor extends TextProcessor $xpath->registerNamespace('atom', "http://www.w3.org/2005/Atom"); $link_nodes = array( "/feed/entry" => array( "url" =>"link", "text" => "title"), - "/atom:feed/atom:entry" + "/atom:feed/atom:entry" => array( "url" =>"link", "text" => "title"), ); } @@ -310,4 +310,4 @@ class RssProcessor extends TextProcessor } -?> +?> \ No newline at end of file diff --git a/lib/utility.php b/lib/utility.php index 33eb46158..46dc937a6 100755 --- a/lib/utility.php +++ b/lib/utility.php @@ -1072,7 +1072,7 @@ function readMessage() /** * Checks if class_1 is the same as class_2 of has class_2 as a parent - * Behaves like 3 param version (last param true) of PHP is_a function + * Behaves like 3 param version (last param true) of PHP is_a function * that came into being with Version 5.3.9. * */ @@ -1081,4 +1081,4 @@ function general_is_a($class_1, $class_2) if($class_1 == $class_2) return true; return (is_a($class_1, $class_2) || is_subclass_of($class_1, $class_2)); } -?> +?> \ No newline at end of file diff --git a/locale/en-US/pages/bot.thtml b/locale/en-US/pages/bot.thtml index cd05c5b11..928772bb8 100755 --- a/locale/en-US/pages/bot.thtml +++ b/locale/en-US/pages/bot.thtml @@ -1,6 +1,6 @@ -title=Bot - -description=Describes the web crawler used with this -web site -END_HEAD_VARS +title=Bot + +description=Describes the web crawler used with this +web site +END_HEAD_VARS Please Describe Your Robot \ No newline at end of file diff --git a/locale/fr-FR/configure.ini b/locale/fr-FR/configure.ini index b18424747..0fc285b4d 100755 --- a/locale/fr-FR/configure.ini +++ b/locale/fr-FR/configure.ini @@ -864,7 +864,7 @@ manageaccount_element_old_password = "Ancien Mot de passe:" manageaccount_element_new_password = "Nouveau Mot de passe:" ; ; manageaccount_element.php line: 76 -manageaccount_element_retype_password = "Veuillez resaisir votre mot de passe: " +manageaccount_element_retype_password = "Veuillez resaisir votre mot de passe: " ; ; manageaccount_element.php line: 84 manageaccount_element_save = "" @@ -1582,4 +1582,4 @@ statistics_view_url = "" statistics_view_number_hosts = "" ; ; view.php line: 35 -view_locale_version2 = "" +view_locale_version2 = "" \ No newline at end of file diff --git a/models/crawl_model.php b/models/crawl_model.php index 9316f1a83..eca88989c 100755 --- a/models/crawl_model.php +++ b/models/crawl_model.php @@ -538,7 +538,7 @@ EOT; "disallowed_sites" => array(self::DISALLOWED_SITES, "url"), "seed_sites" => array(self::TO_CRAWL, "url"), "page_rules" => array(self::PAGE_RULES, "rule"), - "indexed_file_types" => array(self::INDEXED_FILE_TYPES, + "indexed_file_types" => array(self::INDEXED_FILE_TYPES, "extensions"), ); foreach($site_types as $type => $info) { @@ -585,7 +585,7 @@ EOT; "allowed_sites" => array(self::ALLOWED_SITES,'url'), "disallowed_sites" => array(self::DISALLOWED_SITES, 'url'), "page_rules" => array(self::PAGE_RULES, 'rule'), - "indexed_file_types" => array(self::INDEXED_FILE_TYPES, + "indexed_file_types" => array(self::INDEXED_FILE_TYPES, "extensions") ); foreach($updatable_site_info as $type => $info) { @@ -1303,4 +1303,4 @@ EOT; } } -?> +?> \ No newline at end of file diff --git a/models/locale_model.php b/models/locale_model.php index 52f50493b..dd9b1ceff 100644 --- a/models/locale_model.php +++ b/models/locale_model.php @@ -442,7 +442,7 @@ class LocaleModel extends Model if(isset($this->configure['strings'][$msg_id])) { $msg_string = $this->configure['strings'][$msg_id]; } - if($msg_string == "" && + if($msg_string == "" && isset($this->default_configure['strings'][$msg_id])) { $msg_string = $this->default_configure['strings'][$msg_id]; } @@ -549,7 +549,7 @@ class LocaleModel extends Model */ function extractMergeLocales() { - $list = $this->getLocaleList(); + $list = $this->getLocaleList(); // getLocaleList will also create any missing locale dirs $strings = $this->getTranslateStrings($this->extract_dirs, $this->extensions); @@ -698,8 +698,8 @@ EOT; /** * Computes a string of the form string_id = 'translation' for a string_id * from among translation array data in $new_configure (most preferred, - * probably come from recent web form data), $old_configure - * (probably from work dir), and $fallback_configure (probably from base + * probably come from recent web form data), $old_configure + * (probably from work dir), and $fallback_configure (probably from base * dir of Yioop instance, least preferred). * * @param array $new_configure string_id => translation pairs @@ -710,7 +710,7 @@ EOT; * has a translation for a string_id * @return string translation in format describe above */ - function updateTranslation($new_configure, $old_configure, + function updateTranslation($new_configure, $old_configure, $fallback_configure, $string_id, $default_value = "") { $translation = $string_id . ' = "'. @@ -720,8 +720,8 @@ EOT; } /** - * Translates a string_id from among translation array data in - * $new_configure (most preferred, probably come from recent web form + * Translates a string_id from among translation array data in + * $new_configure (most preferred, probably come from recent web form * data), $old_configure (probably from work dir), and $fallback_configure * (probably from base dir of Yioop instance, least preferred). * @@ -756,7 +756,7 @@ EOT; */ function isTranslated($translations, $string_id) { - return isset($translations[$string_id]) && + return isset($translations[$string_id]) && strlen($translations[$string_id]) > 0; } @@ -893,4 +893,4 @@ EOT; } } - ?> + ?> \ No newline at end of file diff --git a/models/machine_model.php b/models/machine_model.php index a5dd6d16a..c0c74630b 100644 --- a/models/machine_model.php +++ b/models/machine_model.php @@ -222,7 +222,7 @@ class MachineModel extends Model * @return string containing the last MachineController::LOG_LISTING_LEN * bytes of the log record */ - function getLog($machine_name, + function getLog($machine_name, $fetcher_num = NULL, $filter="", $is_mirror = false) { $time = time(); @@ -319,4 +319,4 @@ class MachineModel extends Model } } - ?> + ?> \ No newline at end of file diff --git a/models/model.php b/models/model.php index 010b13988..9333b338e 100755 --- a/models/model.php +++ b/models/model.php @@ -410,7 +410,7 @@ class Model implements CrawlConstants FROM TRANSLATION T, LOCALE L, TRANSLATION_LOCALE TL WHERE T.IDENTIFIER_STRING = '$string_id' AND L.LOCALE_TAG = '$locale_tag' AND - L.LOCALE_ID = TL.LOCALE_ID AND + L.LOCALE_ID = TL.LOCALE_ID AND T.TRANSLATION_ID = TL.TRANSLATION_ID LIMIT 1 EOD; $result = $this->db->execute($sql); @@ -421,4 +421,4 @@ EOD; return $string_id; } } -?> +?> \ No newline at end of file diff --git a/models/phrase_model.php b/models/phrase_model.php index 95b61b6ac..cd34df2a6 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -672,7 +672,7 @@ class PhraseModel extends ParallelModel /** * Idealistically, this function tries to guess from the query what the - * user is looking for. For now, we are just doing simple things like + * user is looking for. For now, we are just doing simple things like * when a query term is a url and rewriting it to the appropriate meta * meta word. * @@ -899,7 +899,7 @@ class PhraseModel extends ParallelModel * be used during lookup * @param string $original_query if set, the original query that corresponds * to $word_structs - * @param string $save_timestamp_name if this timestamp is not empty, then + * @param string $save_timestamp_name if this timestamp is not empty, then * save iterate position, so can resume on future queries that make * use of the timestamp. If used then $limit ignored and get next $num * docs after $save_timestamp 's previous iterate position. @@ -1119,7 +1119,7 @@ class PhraseModel extends ParallelModel * @param array &$pages of page data without text summaries * @param array &$queue_servers array of queue server to find data on * @param int $raw only lookup locations if 0 - * @param bool $groups_with_docs whether to return only groups that + * @param bool $groups_with_docs whether to return only groups that * contain at least one doc as opposed to a groups with only links * @return array pages with summaries added */ @@ -1257,7 +1257,7 @@ class PhraseModel extends ParallelModel if(!$network_flag) { $doc_iterate_hash = crawlHash("site:any"); $doc_iterate_group_hash = crawlHash("site:doc"); - if($save_timestamp_name != "") { + if($save_timestamp_name != "") { // used for archive crawls of crawl mixes $save_file = CRAWL_DIR.'/schedules/'.self::save_point. $save_timestamp_name.".txt"; @@ -1360,4 +1360,4 @@ class PhraseModel extends ParallelModel } -?> +?> \ No newline at end of file diff --git a/models/profile_model.php b/models/profile_model.php index f05969292..3e953e8ad 100644 --- a/models/profile_model.php +++ b/models/profile_model.php @@ -281,19 +281,19 @@ EOT; USER_NAME VARCHAR(16) UNIQUE, PASSWORD VARCHAR(16))", "CREATE TABLE USER_SESSION(USER_ID INTEGER PRIMARY KEY, SESSION VARCHAR(4096))", - "CREATE TABLE TRANSLATION (TRANSLATION_ID INTEGER PRIMARY KEY + "CREATE TABLE TRANSLATION (TRANSLATION_ID INTEGER PRIMARY KEY $auto_increment, IDENTIFIER_STRING VARCHAR(512) UNIQUE)", "CREATE TABLE LOCALE(LOCALE_ID INTEGER PRIMARY KEY $auto_increment, LOCALE_TAG VARCHAR(16), LOCALE_NAME VARCHAR(256), WRITING_MODE CHAR(5))", - "CREATE TABLE TRANSLATION_LOCALE (TRANSLATION_ID INTEGER, + "CREATE TABLE TRANSLATION_LOCALE (TRANSLATION_ID INTEGER, LOCALE_ID INTEGER, TRANSLATION VARCHAR(4096) )", "CREATE TABLE ROLE (ROLE_ID INTEGER PRIMARY KEY $auto_increment, NAME VARCHAR(512))", "CREATE TABLE ROLE_ACTIVITY (ROLE_ID INTEGER, ACTIVITY_ID INTEGER)", - "CREATE TABLE ACTIVITY (ACTIVITY_ID INTEGER PRIMARY KEY - $auto_increment, TRANSLATION_ID INTEGER, + "CREATE TABLE ACTIVITY (ACTIVITY_ID INTEGER PRIMARY KEY + $auto_increment, TRANSLATION_ID INTEGER, METHOD_NAME VARCHAR(256))", "CREATE TABLE USER_ROLE (USER_ID INTEGER, ROLE_ID INTEGER)", "CREATE TABLE CURRENT_WEB_INDEX (CRAWL_TIME INT(11) )", @@ -488,4 +488,4 @@ EOT; } } -?> +?> \ No newline at end of file diff --git a/models/source_model.php b/models/source_model.php index 89727f5b7..b95af9d0f 100644 --- a/models/source_model.php +++ b/models/source_model.php @@ -101,7 +101,7 @@ class SourceModel extends Model } else { $sql .= " AND "; } - $sql .= " NOT EXISTS + $sql .= " NOT EXISTS (SELECT * FROM FEED_ITEM F WHERE F.SOURCE_NAME = M.NAME)"; } @@ -367,7 +367,7 @@ class SourceModel extends Model // maybe we're dealing with atom rather than rss $nodes = $dom->getElementsByTagName('entry'); $rss_elements = array( - "title" => "title", "description" => "summary", + "title" => "title", "description" => "summary", "link" => "link", "guid" => "id", "pubDate" => "updated"); } $max_time = min(self::MAX_EXECUTION_TIME, @@ -609,4 +609,4 @@ class SourceModel extends Model return $meta_ids; } } - ?> + ?> \ No newline at end of file diff --git a/models/user_model.php b/models/user_model.php index 775678cb2..b28168fc2 100755 --- a/models/user_model.php +++ b/models/user_model.php @@ -107,7 +107,7 @@ class UserModel extends Model if($translate) { $activities[$i]['ACTIVITY_NAME'] = $translate['ACTIVITY_NAME']; } - if(!isset($activities[$i]['ACTIVITY_NAME']) || + if(!isset($activities[$i]['ACTIVITY_NAME']) || $activities[$i]['ACTIVITY_NAME'] == "") { $activities[$i]['ACTIVITY_NAME'] = $this->translateDb( $activities[$i]['IDENTIFIER_STRING'], DEFAULT_LOCALE); @@ -281,4 +281,4 @@ class UserModel extends Model } } - ?> + ?> \ No newline at end of file diff --git a/scripts/suggest.js b/scripts/suggest.js index c32ef737d..dec3a238b 100644 --- a/scripts/suggest.js +++ b/scripts/suggest.js @@ -269,7 +269,7 @@ function correctSpelling(word) var trie_subtree; var curr_prob = 0; var candidates = known(edits1(word)); - + candidates.push(word); var corrected_word = ""; var correct_threshold = 25; @@ -749,7 +749,7 @@ function spellCheck() var spell_link = "?" + token_name + "=" + csrf_token + "&q=" +corrected_query; corrected_spell.innerHTML = "<b>" + local_strings.spell - +": <a rel='nofollow' href='" + spell_link + + +": <a rel='nofollow' href='" + spell_link + "'>" + corrected_query + "</a></b>"; } } @@ -766,4 +766,4 @@ ip_field.oncut = function(e) { setTimeout(function(){ onTypeTerm(e,ip_field); }, 0); -} +} \ No newline at end of file diff --git a/views/elements/configure_element.php b/views/elements/configure_element.php index cd7c482b9..9990c0681 100644 --- a/views/elements/configure_element.php +++ b/views/elements/configure_element.php @@ -93,7 +93,7 @@ class ConfigureElement extends Element e($data['WORK_DIRECTORY']); ?>" /> <?php }?> <input type="hidden" name="c" value="admin" /> - <input type="hidden" name="advanced" id='a-settings' value="<?php + <input type="hidden" name="advanced" id='a-settings' value="<?php e($data['advanced']); ?>" /> <input type="hidden" name="<?php e(CSRF_TOKEN); ?>" value="<?php e($data[CSRF_TOKEN]); ?>" /> @@ -105,7 +105,7 @@ class ConfigureElement extends Element </div> <h2><?php e(tl('configure_element_profile_settings'))?></h2> <?php if($data['PROFILE']) { ?> - <div class="top-margin">[<a href="javascript:toggleAdvance()"><?php + <div class="top-margin">[<a href="javascript:toggleAdvance()"><?php e(tl('configure_element_toggle_advanced')); ?></a>]</div> <?php } ?> <div class="bold"> @@ -273,4 +273,4 @@ class ConfigureElement extends Element <?php } } -?> +?> \ No newline at end of file diff --git a/views/elements/pageoptions_element.php b/views/elements/pageoptions_element.php index 9da4a0265..df9e955a1 100644 --- a/views/elements/pageoptions_element.php +++ b/views/elements/pageoptions_element.php @@ -97,8 +97,8 @@ class PageOptionsElement extends Element ?></div> <div class="top-margin"><b><label for="cache-pages"><?php e(tl('pageoptions_element_save_cache'))?> - </label><input - id='cache-pages' type="checkbox" name="cache_pages" + </label><input + id='cache-pages' type="checkbox" name="cache_pages" value="true" <?php if(isset($data['CACHE_PAGES']) && $data['CACHE_PAGES']) { e("checked='checked'"); @@ -180,7 +180,7 @@ class PageOptionsElement extends Element <table class="search-page-all"><tr><td> <table class="search-page-table"> <tr> - <td><label for="wd-suggest"><?php + <td><label for="wd-suggest"><?php e(tl('pageoptions_element_wd_suggest')); ?></label></td> <td><input id='wd-suggest' type="checkbox" name="WORD_SUGGEST" value="true" @@ -188,7 +188,7 @@ class PageOptionsElement extends Element $data['WORD_SUGGEST']){ e("checked='checked'");}?> /></td></tr> - <tr><td><label for="subsearch-link"><?php + <tr><td><label for="subsearch-link"><?php e(tl('pageoptions_element_subsearch_link'));?></label></td><td> <input id='subsearch-link' type="checkbox" name="SUBSEARCH_LINK" value="true" @@ -197,7 +197,7 @@ class PageOptionsElement extends Element e("checked='checked'");}?> /></td> </tr> - <tr><td><label for="signin-link"><?php + <tr><td><label for="signin-link"><?php e(tl('pageoptions_element_signin_link')); ?></label></td><td> <input id='signin-link' type="checkbox" name="SIGNIN_LINK" value="true" @@ -273,7 +273,7 @@ class PageOptionsElement extends Element <h2><?php e(tl('pageoptions_element_test_page'))?></h2> <div class="top-margin"><b><label for="page-type"><?php e(tl('pageoptions_element_page_type'))?></label></b> - <?php + <?php $types = $data['MIME_TYPES']; $this->view->optionsHelper->render("page-type", "page_type", array_combine($types, $types), @@ -286,7 +286,7 @@ class PageOptionsElement extends Element </div> - <div class="center slight-pad"><button class="button-box" + <div class="center slight-pad"><button class="button-box" id="page-button" type="submit"><?php if($data['test_options_active'] == "") { e(tl('pageoptions_element_save_options')); @@ -315,7 +315,7 @@ class PageOptionsElement extends Element e("<h3>".tl('pageoptions_element_extracted_metas')."</h3>"); e("<pre>\n{$data['EXTRACTED_META_WORDS']}\n</pre>"); } ?> - <?php + <?php } ?> </div> </div> @@ -359,4 +359,4 @@ class PageOptionsElement extends Element <?php } } -?> +?> \ No newline at end of file diff --git a/views/machinestatus_view.php b/views/machinestatus_view.php index 8dab2bec4..6fdb94656 100644 --- a/views/machinestatus_view.php +++ b/views/machinestatus_view.php @@ -67,7 +67,7 @@ class MachinestatusView extends View } else { ?> <div class="box"> - <h3 class="nomargin"><?php + <h3 class="nomargin"><?php e(tl('machinestatus_view_news_updater')); $log_url = $base_url ."log&name=news"; ?></h3> @@ -82,7 +82,7 @@ class MachinestatusView extends View <td><?php $this->optionsHelper->render("news-mode", "news_mode", $data['NEWS_MODES'], $data['NEWS_MODE'], true);?> </td> - <td>[<a href="<?php e($log_url);?>"><?php + <td>[<a href="<?php e($log_url);?>"><?php e(tl('machinestatus_view_log'));?></a>]</td> </tr></table> </form> @@ -179,4 +179,4 @@ class MachinestatusView extends View } } } -?> +?> \ No newline at end of file