viewgit/inc/functions.php:22 Function utf8_encode() is deprecated [8192]
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2018 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2018
 * @filesource
 */
namespace seekquarry\yioop\controllers\components;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\classifiers\Classifier;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\CrawlDaemon;
use seekquarry\yioop\library\PageRuleParser;
use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library\UrlParser;
use seekquarry\yioop\library\processors as P;
use seekquarry\yioop\library\processors\PageProcessor;

/**
 * This component is used to provide activities for the admin controller
 * related to configuring and performing a web or archive crawl
 *
 * @author Chris Pollett
 */
class CrawlComponent extends Component implements CrawlConstants
{
    /**
     * Used to handle the manage crawl activity.
     *
     * This activity allows new crawls to be started, statistics about old
     * crawls to be seen. It allows a user to stop the current crawl or
     * restart an old crawl. It also allows a user to configure the options
     * by which a crawl is conducted.
     *
     * The sub-activity is chosen by $_REQUEST['arg'], which must be one of
     * delete, index, options, queryStats, resume, start, statistics, stop.
     * Most sub-activities finish by redirecting with a status message.
     *
     * @return array $data information and statistics about crawls in the
     *     system as well as status messages on performing a given sub activity
     */
    public function manageCrawls()
    {
        $parent = $this->parent;
        $crawl_model = $parent->model("crawl");
        $possible_arguments = ["delete", "index", "options", "queryStats",
            "resume", "start", "statistics", "stop"];
        $data["ELEMENT"] = "managecrawls";
        $data['SCRIPT'] = "doUpdate();";
        $request_fields = ['start_row', 'num_show', 'end_row'];
        /* start_row defaults to 0; num_show to the default page size;
           end_row to whatever num_show ended up being */
        $flag = 0;
        foreach ($request_fields as $field) {
            $data[strtoupper($field)] = isset($_REQUEST[$field]) ?
                max(0, $parent->clean($_REQUEST[$field], 'int')) :
                (isset($data['NUM_SHOW']) ? $data['NUM_SHOW'] :
                $flag * C\DEFAULT_ADMIN_PAGING_NUM);
            $flag = 1;
        }
        if (isset($_REQUEST['arg']) &&
            in_array($_REQUEST['arg'], $possible_arguments)) {
            $machine_urls = $parent->model("machine")->getQueueServerUrls();
            $num_machines = count($machine_urls);
            // a single localhost queue server is treated as "no remote machines"
            if ($num_machines < 1 || ($num_machines == 1 &&
                UrlParser::isLocalhostUrl($machine_urls[0]))) {
                $machine_urls = null;
            }
            switch ($_REQUEST['arg']) {
                case "delete":
                    if (isset($_REQUEST['timestamp'])) {
                        $timestamp = substr($parent->clean(
                            $_REQUEST['timestamp'], "int"), 0,
                            C\TIMESTAMP_LEN);
                        $crawl_model->deleteCrawl($timestamp,
                            $machine_urls);
                        return $parent->redirectWithMessage(
                            tl('crawl_component_delete_crawl_success'),
                            $request_fields);
                    } else {
                        return $parent->redirectWithMessage(
                            tl('crawl_component_delete_crawl_fail'),
                            $request_fields);
                    }
                    break;
                case "index":
                    $timestamp = substr($parent->clean($_REQUEST['timestamp'],
                        "int"), 0, C\TIMESTAMP_LEN);
                    $crawl_model->setCurrentIndexDatabaseName($timestamp);
                    return $parent->redirectWithMessage(
                        tl('crawl_component_set_index'),
                        $request_fields);
                case "options":
                    $this->editCrawlOption($data, $machine_urls);
                    break;
                case "queryStats":
                    $data["ELEMENT"] = "querystats";
                    $data["leftorright"] = (L\getLocaleDirection() == 'ltr') ?
                        "right": "left";
                    $impression_model = $parent->model("impression");
                    $periods = [C\ONE_HOUR, C\ONE_DAY, C\ONE_MONTH,
                        C\ONE_YEAR, C\FOREVER];
                    // request cleaning is done by the controller ($parent),
                    // as everywhere else in this component
                    $filter = (empty($_REQUEST['filter'])) ? "" :
                        $parent->clean($_REQUEST['filter'], 'string');
                    $data['FILTER'] = $filter;
                    foreach ($periods as $period) {
                        $data["STATISTICS"][$period] =
                            $impression_model->getStatistics(
                            C\QUERY_IMPRESSION, $period, $filter);
                    }
                    /*
                       Add Differential Privacy for query statistics if enabled
                     */
                    if (C\DIFFERENTIAL_PRIVACY) {
                        $i = 0;
                        /* $tmp_data[item][k] holds the fuzzified view count of
                           item in the k-th non-empty period seen so far */
                        $tmp_data = [];
                        foreach ($periods as $period) {
                            if (!empty($data["STATISTICS"][$period])) {
                                foreach ($data['STATISTICS'][$period] as
                                    $item_name => $item_data) {
                                    $view_stat =
                                        $impression_model->getImpressionStat(
                                        $item_data[0]['ID'],
                                        C\QUERY_IMPRESSION, $period);
                                    $fuzzy_views = $view_stat[1];
                                    /* There is no previous value for the first
                                       period, or for an item that first shows
                                       up in a later period */
                                    $has_previous = isset(
                                        $tmp_data[$item_name][$i - 1]);
                                    if ($view_stat[0] !=
                                        $item_data[0]['NUM_VIEWS'] ||
                                        ($has_previous &&
                                        $tmp_data[$item_name][$i - 1] >
                                        $fuzzy_views)) {
                                        $fuzzy_views =
                                            $parent->addDifferentialPrivacy(
                                            $item_data[0]['NUM_VIEWS']);
                                        /* Make sure each time period's
                                           fuzzified view is at least as large
                                           as previous time period's value */
                                        if ($has_previous &&
                                            $tmp_data[$item_name][$i - 1] >
                                            $fuzzy_views) {
                                            $fuzzy_views =
                                                $tmp_data[$item_name][$i - 1];
                                        }
                                        $impression_model->updateImpressionStat(
                                            $item_data[0]['ID'],
                                            C\QUERY_IMPRESSION, $period,
                                            $item_data[0]['NUM_VIEWS'],
                                            $fuzzy_views);
                                    }
                                    $data["STATISTICS"][
                                        $period][$item_name][0]['NUM_VIEWS'] =
                                        ($fuzzy_views == 0) ?
                                        tl('managegroups_element_no_activity') :
                                        $fuzzy_views;
                                    $tmp_data[$item_name][$i] = $fuzzy_views;
                                }
                                $i++;
                            }
                        }
                    }
                    break;
                case "resume":
                    $crawl_params = [];
                    $crawl_params[self::STATUS] = "RESUME_CRAWL";
                    $crawl_params[self::CRAWL_TIME] =
                        substr($parent->clean($_REQUEST['timestamp'], "int"), 0,
                        C\TIMESTAMP_LEN);
                    $seed_info = $crawl_model->getCrawlSeedInfo(
                        $crawl_params[self::CRAWL_TIME], $machine_urls);
                    $this->getCrawlParametersFromSeedInfo($crawl_params,
                        $seed_info);
                    $crawl_params[self::TOR_PROXY] = C\TOR_PROXY;
                    if (C\USE_PROXY) {
                        $crawl_params[self::PROXY_SERVERS] =
                            explode("|Z|", C\PROXY_SERVERS);
                    }
                    /*
                       Write the new crawl parameters to the name server, so
                       that it can pass them along in the case of a new archive
                       crawl.
                    */
                    $filename = C\CRAWL_DIR.
                        "/schedules/NameServerMessages.txt";
                    $parent->web_site->filePutContents($filename,
                        serialize($crawl_params));
                    chmod($filename, 0777);
                    if($crawl_model->sendStartCrawlMessage($crawl_params,
                        null, $machine_urls)) {
                        return $parent->redirectWithMessage(
                            tl('crawl_component_resume_crawl'),
                            $request_fields);
                    }
                    return $parent->redirectWithMessage(
                        tl('crawl_component_resume_fail'),
                        $request_fields);
                case "start":
                    $this->startCrawl($data, $machine_urls, $request_fields);
                    return $parent->redirectWithMessage(
                        tl('crawl_component_starting_new_crawl'),
                        $request_fields);
                case "statistics":
                    $data["ELEMENT"] = "statistics";
                    $data["leftorright"] = (L\getLocaleDirection() == 'ltr') ?
                        "right": "left";
                    $index = (empty($_REQUEST['its'])) ? "" :
                        substr($parent->clean($_REQUEST['its'], "string"), 0,
                        C\TIMESTAMP_LEN);
                    /* validate timestamp against list
                       (some crawlers replay deleted crawls)
                     */
                    if ($index) {
                        $crawls = $crawl_model->getCrawlList(false, true,
                            $machine_urls, true);
                        $found_crawl = false;
                        foreach ($crawls as $crawl) {
                            if ($index == $crawl['CRAWL_TIME']) {
                                $found_crawl = true;
                                break;
                            }
                        }
                        $index = ($found_crawl) ? $index : false;
                    }
                    if (!$index) {
                        include(C\BASE_DIR."/error.php");
                        \seekquarry\yioop\library\webExit(); //bail
                    }
                    $data['its'] = $index;
                    $this->crawlStatistics($data, $machine_urls);
                    if (!empty($_REQUEST['recompute'])) {
                        return $parent->redirectWithMessage(
                            tl('crawl_component_recomputing_stats',
                            $data['its']));
                    }
                    break;
                case "stop":
                    $crawl_param_file = C\CRAWL_DIR .
                        "/schedules/crawl_params.txt";
                    if (file_exists($crawl_param_file)) {
                        unlink($crawl_param_file);
                    }
                    $info = [];
                    $info[self::STATUS] = "STOP_CRAWL";
                    $filename = C\CRAWL_DIR.
                        "/schedules/NameServerMessages.txt";
                    $parent->web_site->filePutContents($filename,
                        serialize($info));
                    $crawl_model->sendStopCrawlMessage($machine_urls);
                    return $parent->redirectWithMessage(
                        tl('crawl_component_stop_crawl'),
                        $request_fields);
            }
        }
        return $data;
    }
    /**
     * Called from @see manageCrawls to start a new crawl on the machines
     * $machine_urls. Updates $data array with crawl start message
     *
     * @param array& $data an array of info to supply to AdminView
     * @param array $machine_urls string urls of machines managed by this
     *     Yioop name server on which to perform the crawl
     * @param array $request_fields if start crawl fails this is a list of
     *     request fields to preserve in the redirect message
     * @param array $seed_info allowed, disallowed, seed urls, etc to use in
     *     crawl. If empty, the seed info currently stored by the crawl
     *     model is used
     */
    public function startCrawl(&$data, $machine_urls, $request_fields,
        $seed_info = null)
    {
        $parent = $this->parent;
        $crawl_model = $parent->model("crawl");
        $crawl_params = [];
        $crawl_params[self::STATUS] = "NEW_CRAWL";
        $crawl_params[self::CRAWL_TIME] = time();
        // only fall back to the stored seed info when the caller gave none
        if (empty($seed_info)) {
            $seed_info = $crawl_model->getSeedInfo();
        }
        $this->getCrawlParametersFromSeedInfo($crawl_params, $seed_info);
        if (isset($_REQUEST['description'])) {
            $description = substr(
                $parent->clean($_REQUEST['description'], "string"), 0,
                C\TITLE_LEN);
        } else {
            $description = tl('crawl_component_no_description');
        }
        $crawl_params['DESCRIPTION'] = $description;
        $crawl_params[self::TOR_PROXY] = C\TOR_PROXY;
        if (C\USE_PROXY) {
$crawl_params[self::PROXY_SERVERS] = explode("|Z|", C\PROXY_SERVERS);
        }
        // keep only the part of each video source url before its "{}"
        $crawl_params[self::VIDEO_SOURCES] = [];
        $sources = $parent->model("source")->getMediaSources('video');
        foreach ($sources as $source) {
            $url = $source['SOURCE_URL'];
            $url_parts = explode("{}", $url);
            $crawl_params[self::VIDEO_SOURCES][] = $url_parts[0];
        }
        if (isset($crawl_params[self::INDEXING_PLUGINS]) &&
            is_array($crawl_params[self::INDEXING_PLUGINS])) {
            foreach ($crawl_params[self::INDEXING_PLUGINS] as $plugin) {
                if ($plugin == "") {
                    continue;
                }
                $plugin_class = C\NS_PLUGINS . $plugin."Plugin";
                $plugin_obj = $parent->plugin(lcfirst($plugin));
                if (method_exists($plugin_class, "loadConfiguration")) {
                    $crawl_params[self::INDEXING_PLUGINS_DATA][$plugin] =
                        $plugin_obj->loadConfiguration();
                }
            }
        }
        /*
           Write the new crawl parameters to the name server, so
           that it can pass them along in the case of a new archive
           crawl.
        */
        $filename = C\CRAWL_DIR.
            "/schedules/NameServerMessages.txt";
        $parent->web_site->filePutContents($filename,
            serialize($crawl_params));
        chmod($filename, 0777);
        if(!$crawl_model->sendStartCrawlMessage($crawl_params,
            $seed_info, $machine_urls)) {
            $parent->redirectWithMessage(
                tl('crawl_component_start_fail'),
                $request_fields);
        }
    }
    /**
     * Reads the parameters for a crawl from an array gotten from a crawl.ini
     * file
     *
     * The 'general' entries crawl_type, crawl_order and
     * restrict_sites_by_url, and the 'seed_sites' url list, are read
     * without an isset check; the remaining entries are optional and are
     * either given a default or left out of $crawl_params when absent.
     *
     * @param array& $crawl_params parameters to write to queue_server
     * @param array $seed_info data from crawl.ini file
     */
    public function getCrawlParametersFromSeedInfo(&$crawl_params,
        $seed_info)
    {
        $crawl_params[self::CRAWL_TYPE] = $seed_info['general']['crawl_type'];
        $crawl_params[self::CRAWL_INDEX] =
            (isset($seed_info['general']['crawl_index'])) ?
            $seed_info['general']['crawl_index'] : '';
        $crawl_params[self::ARC_DIR]=
            (isset($seed_info['general']['arc_dir'])) ?
            $seed_info['general']['arc_dir'] : '';
        $crawl_params[self::ARC_TYPE] =
            (isset($seed_info['general']['arc_type'])) ?
            $seed_info['general']['arc_type'] : '';
        $crawl_params[self::CACHE_PAGES] =
            (isset($seed_info['general']['cache_pages'])) ?
            intval($seed_info['general']['cache_pages']) : true;
        $crawl_params[self::PAGE_RANGE_REQUEST] =
            (isset($seed_info['general']['page_range_request'])) ?
            intval($seed_info['general']['page_range_request']) :
            C\PAGE_RANGE_REQUEST;
        $crawl_params[self::MAX_DESCRIPTION_LEN] =
            (isset($seed_info['general']['max_description_len'])) ?
            intval($seed_info['general']['max_description_len']) :
            C\MAX_DESCRIPTION_LEN;
        $crawl_params[self::PAGE_RECRAWL_FREQUENCY] =
            (isset($seed_info['general']['page_recrawl_frequency'])) ?
            intval($seed_info['general']['page_recrawl_frequency']) :
            C\PAGE_RECRAWL_FREQUENCY;
        $crawl_params[self::TO_CRAWL] = $seed_info['seed_sites']['url'];
        $crawl_params[self::CRAWL_ORDER] =
            $seed_info['general']['crawl_order'];
        $crawl_params[self::RESTRICT_SITES_BY_URL] =
            $seed_info['general']['restrict_sites_by_url'];
        $crawl_params[self::ALLOWED_SITES] =
            isset($seed_info['allowed_sites']['url']) ?
            $seed_info['allowed_sites']['url'] : [];
        $crawl_params[self::DISALLOWED_SITES] =
            isset($seed_info['disallowed_sites']['url']) ?
            $seed_info['disallowed_sites']['url'] : [];
        if (isset($seed_info['indexed_file_types']['extensions'])) {
            $crawl_params[self::INDEXED_FILE_TYPES] =
                $seed_info['indexed_file_types']['extensions'];
        }
        if (isset($seed_info['general']['summarizer_option'])) {
            $crawl_params[self::SUMMARIZER_OPTION] =
                $seed_info['general']['summarizer_option'];
        }
        if (isset($seed_info['active_classifiers']['label'])) {
            // Note that 'label' is actually an array of active class labels.
            $crawl_params[self::ACTIVE_CLASSIFIERS] =
                $seed_info['active_classifiers']['label'];
        }
        if (isset($seed_info['active_rankers']['label'])) {
            // Note that 'label' is actually an array of active class labels.
            $crawl_params[self::ACTIVE_RANKERS] =
                $seed_info['active_rankers']['label'];
        }
        if (isset($seed_info['indexing_plugins']['plugins'])) {
            $crawl_params[self::INDEXING_PLUGINS] =
                $seed_info['indexing_plugins']['plugins'];
        }
        $crawl_params[self::PAGE_RULES] =
            isset($seed_info['page_rules']['rule']) ?
            $seed_info['page_rules']['rule'] : [];
    }
    /**
     * Called from @see manageCrawls to edit the parameters for the next
     * crawl (or current crawl) to be carried out by the machines
     * $machine_urls. Updates $data array to be supplied to AdminView
     *
     * @param array& $data an array of info to supply to AdminView
     * @param array $machine_urls string urls of machines managed by this
     *     Yioop name server on which to perform the crawl
     */
    public function editCrawlOption(&$data, $machine_urls)
    {
        $parent = $this->parent;
        $crawl_model= $parent->model("crawl");
        $data["leftorright"] = (L\getLocaleDirection() == 'ltr') ?
            "right": "left";
        $data["ELEMENT"] = "crawloptions";
        $crawls = $crawl_model->getCrawlList(false, false, $machine_urls);
        $indexes = $crawl_model->getCrawlList(true, true, $machine_urls);
        if (isset($_SESSION['USER_ID'])) {
            $user = $_SESSION['USER_ID'];
        } else {
            $user = L\remoteAddress();
        }
        // crawl mixes are offered as archive sources next to real indexes
        $mixes = $crawl_model->getMixList($user, false);
        foreach ($mixes as $mix) {
            $tmp = [];
            $tmp["DESCRIPTION"] = "MIX::".$mix["NAME"];
            $tmp["CRAWL_TIME"] = $mix["TIMESTAMP"];
            $tmp["ARC_DIR"] = "MIX";
            $tmp["ARC_TYPE"] = "MixArchiveBundle";
            $indexes[] = $tmp;
        }
        $add_message = "";
        $indexes_by_crawl_time = [];
        $update_flag = false;
        $data['available_options'] = [
            tl('crawl_component_use_below'),
            tl('crawl_component_use_defaults')];
        $data['available_crawl_indexes'] = [];
        $data['INJECT_SITES'] = "";
        $data['options_default'] = tl('crawl_component_use_below');
        foreach ($crawls as $crawl) {
            if (strlen($crawl['DESCRIPTION']) > 0 ) {
                $data['available_options'][$crawl['CRAWL_TIME']] =
                    tl('crawl_component_previous_crawl')." ".
$crawl['DESCRIPTION'];
            }
        }
        foreach ($indexes as $i => $crawl) {
            $data['available_crawl_indexes'][$crawl['CRAWL_TIME']] =
                $crawl['DESCRIPTION'];
            $indexes_by_crawl_time[$crawl['CRAWL_TIME']] =& $indexes[$i];
        }
        $no_further_changes = false;
        $seed_current = $crawl_model->getSeedInfo();
        /* 'general' settings that are edited on the page options activity;
           when options are loaded from defaults or from an old crawl the
           currently stored values of these are kept */
        $kept_general_fields = ['page_range_request',
            'page_recrawl_frequency', 'max_description_len'];
        $load_option_given = isset($_REQUEST['load_option']);
        if ($load_option_given && $_REQUEST['load_option'] == 1) {
            // load the default seed info
            $seed_info = $crawl_model->getSeedInfo(true);
            foreach ($kept_general_fields as $general_field) {
                if (isset($seed_current['general'][$general_field])) {
                    $seed_info['general'][$general_field] =
                        $seed_current['general'][$general_field];
                }
            }
            $update_flag = true;
            $no_further_changes = true;
        } else if ($load_option_given && $_REQUEST['load_option'] > 1 ) {
            // load the seed info of a previous crawl, given by its timestamp
            $timestamp = $parent->clean($_REQUEST['load_option'], "int");
            $seed_info = $crawl_model->getCrawlSeedInfo(
                $timestamp, $machine_urls);
            foreach ($kept_general_fields as $general_field) {
                if (isset($seed_current['general'][$general_field])) {
                    $seed_info['general'][$general_field] =
                        $seed_current['general'][$general_field];
                }
            }
            $update_flag = true;
            $no_further_changes = true;
        } else if (isset($_REQUEST['ts'])) {
            // editing the options of the crawl with timestamp ts
            $timestamp = substr($parent->clean($_REQUEST['ts'], "int"), 0,
                C\TIMESTAMP_LEN);
            $seed_info = $crawl_model->getCrawlSeedInfo(
                $timestamp, $machine_urls);
            $data['ts'] = $timestamp;
        } else {
            $seed_info = $crawl_model->getSeedInfo();
        }
        if (!C\DIRECT_ADD_SUGGEST && isset($_REQUEST['suggest']) &&
            $_REQUEST['suggest'] == 'add') {
            $editing_crawl = isset($_REQUEST['ts']);
            $suggest_urls = $crawl_model->getSuggestSites();
            if ($editing_crawl) {
                $new_urls = [];
            } else {
                $seed_info['seed_sites']['url'][] = "#\n#".
                    tl('crawl_component_added_urls', date('r'))."\n#";
                $crawl_model->clearSuggestSites();
            }
            foreach ($suggest_urls as $suggest_url) {
                $suggest_url = trim($suggest_url);
                if (!in_array($suggest_url, $seed_info['seed_sites']['url'])
                    && strlen($suggest_url) > 0) {
                    if ($editing_crawl) {
                        $new_urls[] = $suggest_url;
                    } else {
                        $seed_info['seed_sites']['url'][] = $suggest_url;
                    }
                }
            }
            $add_message= tl('crawl_component_add_suggest');
            if ($editing_crawl) {
                $data["INJECT_SITES"] = $parent->convertArrayLines($new_urls);
                if ($data["INJECT_SITES"] == "") {
                    $add_message= tl('crawl_component_no_new_suggests');
                }
            }
            $update_flag = true;
            $no_further_changes = true;
        }
        //these properties should be changed under page_options not here
        $page_options_properties = ['indexed_file_types',
            'active_classifiers', 'page_rules', 'indexing_plugins'];
        foreach ($page_options_properties as $property) {
            if (isset($seed_current[$property])) {
                $seed_info[$property] = $seed_current[$property];
            }
        }
        if (!$no_further_changes && isset($_REQUEST['crawl_indexes'])
            && in_array($_REQUEST['crawl_indexes'],
            array_keys($data['available_crawl_indexes']))) {
            $seed_info['general']['crawl_index'] = $_REQUEST['crawl_indexes'];
            $index_data = $indexes_by_crawl_time[$_REQUEST['crawl_indexes']];
            if (isset($index_data['ARC_DIR'])) {
                $seed_info['general']['arc_dir'] = $index_data['ARC_DIR'];
                $seed_info['general']['arc_type'] = $index_data['ARC_TYPE'];
            } else {
                $seed_info['general']['arc_dir'] = '';
                $seed_info['general']['arc_type'] = '';
            }
            $update_flag = true;
        }
        if (isset($seed_info['general']['crawl_index'])) {
            $data['crawl_index'] = $seed_info['general']['crawl_index'];
        } else {
            $data['crawl_index'] = '';
        }
        $data['available_crawl_types'] = [self::WEB_CRAWL,
            self::ARCHIVE_CRAWL];
        if (!$no_further_changes && isset($_REQUEST['crawl_type']) &&
            in_array($_REQUEST['crawl_type'],
            $data['available_crawl_types'])) {
            $seed_info['general']['crawl_type'] = $_REQUEST['crawl_type'];
            $update_flag = true;
        }
        $data['crawl_type'] = $seed_info['general']['crawl_type'];
        if ($data['crawl_type'] == self::WEB_CRAWL) {
            $data['web_crawl_active'] = "active";
            $data['archive_crawl_active'] = "";
        } else {
            $data['archive_crawl_active'] = "active";
            $data['web_crawl_active'] = "";
        }
        $data['available_crawl_orders'] = [
            self::BREADTH_FIRST =>
                tl('crawl_component_breadth_first'),
            self::PAGE_IMPORTANCE =>
                tl('crawl_component_page_importance')];
        if (!$no_further_changes && isset($_REQUEST['crawl_order'])
            && in_array($_REQUEST['crawl_order'],
            array_keys($data['available_crawl_orders']))) {
            $seed_info['general']['crawl_order'] = $_REQUEST['crawl_order'];
            $update_flag = true;
        }
        $data['crawl_order'] = $seed_info['general']['crawl_order'];
        if (!$no_further_changes && isset($_REQUEST['posted'])) {
            // an unchecked check box is simply absent from the request
            $seed_info['general']['restrict_sites_by_url'] =
                isset($_REQUEST['restrict_sites_by_url']);
            $update_flag = true;
        }
        $data['restrict_sites_by_url'] =
            $seed_info['general']['restrict_sites_by_url'];
        $site_types = ['allowed_sites' => 'url', 'disallowed_sites' => 'url',
            'seed_sites' => 'url'];
        foreach ($site_types as $type => $field) {
            if (!$no_further_changes && isset($_REQUEST[$type])) {
                $seed_info[$type][$field] =
                    $parent->convertStringCleanArray(
                    $_REQUEST[$type], $field);
                $update_flag = true;
            }
            $data[$type] = "";
            if (isset($seed_info[$type][$field])) {
                $data[$type] = $parent->convertArrayLines(
                    $seed_info[$type][$field]);
            }
        }
        $data['TOGGLE_STATE'] = "";
        if ($data['restrict_sites_by_url']) {
            $data['TOGGLE_STATE'] = "checked='checked'";
        }
        $data['SCRIPT'] = "setDisplay('toggle', ".
            "'{$data['restrict_sites_by_url']}');";
        if (!isset($_REQUEST['ts'])) {
            $data['SCRIPT'] .=
            " elt('load-options').onchange = ".
            "function() { if (elt('load-options').selectedIndex !=".
            " 0) { elt('crawloptionsForm').submit();  }};";
        }
        if ($data['crawl_type'] == CrawlConstants::WEB_CRAWL) {
            $data['SCRIPT'] .= "switchTab('webcrawltab', 'archivetab');";
        } else {
            $data['SCRIPT'] .= "switchTab('archivetab', 'webcrawltab');";
        }
        $inject_urls = [];
        if (isset($_REQUEST['ts']) &&
            isset($_REQUEST['inject_sites']) && $_REQUEST['inject_sites']) {
            $timestamp = substr($parent->clean($_REQUEST['ts'],
                "string"), 0, C\TIMESTAMP_LEN);
            $inject_urls =
                $parent->convertStringCleanArray(
                $_REQUEST['inject_sites']);
        }
        if (!$update_flag) {
            return $data;
        }
        if (isset($_REQUEST['ts'])) {
            if ($inject_urls != []) {
                $seed_info['seed_sites']['url'][] = "#\n#".
                    tl('crawl_component_added_urls', date('r'))."\n#";
                $seed_info['seed_sites']['url'] = array_merge(
                    $seed_info['seed_sites']['url'], $inject_urls);
            }
            $crawl_model->setCrawlSeedInfo($timestamp,
                $seed_info, $machine_urls);
            if ($inject_urls != [] &&
                $crawl_model->injectUrlsCurrentCrawl(
                    $timestamp, $inject_urls, $machine_urls)) {
                $add_message = "<br />".
                    tl('crawl_component_urls_injected');
                if (isset($_REQUEST['use_suggest']) &&
                    $_REQUEST['use_suggest']) {
                    $crawl_model->clearSuggestSites();
                }
            }
        } else {
            $crawl_model->setSeedInfo($seed_info);
        }
        return $parent->redirectWithMessage(
            tl('crawl_component_update_seed_info'). " $add_message",
            ["arg"]);
    }
    /**
     * Called from @see manageCrawls to read in the file with statistics
     * information about a crawl.
* This file is computed by @see AnalyticsJob
     *
     * If the statistics file does not exist yet, only the general crawl
     * info is put in $data and, unless one is already present, a "pre_"
     * file holding that info is written.
     *
     * @param array& $data an array of info to supply to AdminView; its
     *     'its' entry is the timestamp of the crawl to get statistics for
     * @param array $machine_urls machines that are being used in crawl
     */
    public function crawlStatistics(&$data, $machine_urls)
    {
        $parent = $this->parent;
        $crawl_model = $parent->model("crawl");
        $cache_prefix = C\CRAWL_DIR."/cache/pre_";
        $pre_stats_file = $cache_prefix . self::statistics_base_name.
            $data['its'].".txt";
        $stats_file = str_replace("pre_", "", $pre_stats_file);
        $data["HAS_STATISTICS"] = true;
        $data["STATISTICS_SCHEDULED"] = false;
        if (!empty($_REQUEST['recompute'])) {
            // drop the old statistics; a missing file is not an error here
            set_error_handler(null);
            @unlink($stats_file);
            set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        }
        if (file_exists($stats_file)) {
            $stored_stats = unserialize(
                $parent->web_site->fileGetContents($stats_file));
            $data = array_merge($data, $stored_stats);
        } else {
            $info = $crawl_model->getInfoTimestamp($data['its'],
                $machine_urls);
            if (!$info) {
                include(C\BASE_DIR."/error.php");
                \seekquarry\yioop\library\webExit(); //bail
            }
            $info["TIMESTAMP"] = $data['its'];
            $data = array_merge($data, $info);
            $data["HAS_STATISTICS"] = false;
        }
        $data['GENERAL_STATS'] = [
            tl("crawl_component_description") => $data["DESCRIPTION"],
            tl("crawl_component_timestamp") => $data["TIMESTAMP"],
            tl("crawl_component_crawl_date") => date("r",$data["TIMESTAMP"]),
            tl("crawl_component_pages") => $data["VISITED_URLS_COUNT"],
            tl("crawl_component_url") => $data["COUNT"]
        ];
        if (!$data["HAS_STATISTICS"]) {
            if (!empty($info)) {
                if (file_exists($pre_stats_file)) {
                    $data["STATISTICS_SCHEDULED"] = true;
                } else {
                    $parent->web_site->filePutContents($pre_stats_file,
                        serialize($info));
                    chmod($pre_stats_file, 0777);
                }
            }
            return;
        }
        if (isset($data["HOST"]["DATA"]["all"])) {
            $data['GENERAL_STATS'][tl("crawl_component_number_hosts")] =
                $data["HOST"]["DATA"]["all"];
        }
        $data["STAT_HEADINGS"] = [
            tl("crawl_component_error_codes") => "CODE",
            tl("crawl_component_sizes") => "SIZE",
            tl("crawl_component_links_per_page") => "NUMLINKS",
            tl("crawl_component_page_date") => "MODIFIED",
            tl("crawl_component_dns_time") => "DNS",
            tl("crawl_component_download_time") => "TIME",
            tl("crawl_component_top_level_domain") => "SITE",
            tl("crawl_component_file_extension") => "FILETYPE",
            tl("crawl_component_media_type") => "MEDIA",
            tl("crawl_component_language") => "LANG",
            tl("crawl_component_server") => "SERVER",
            tl("crawl_component_os") => "OS",
        ];
    }
    /**
     * Handles admin requests for creating, editing, and deleting classifiers.
     *
     * This activity implements the logic for the page that lists existing
     * classifiers, including the actions that can be performed on them.
     */
    public function manageClassifiers()
    {
        $parent = $this->parent;
        $crawl_model = $parent->model("crawl");
        $possible_arguments = ['createclassifier', 'editclassifier',
            'finalizeclassifier', 'deleteclassifier', 'search'];
        $data['ELEMENT'] = 'manageclassifiers';
        $data['SCRIPT'] = '';
        $data['FORM_TYPE'] = '';
        $search_array = [];
        $request_fields = ['start_row', 'num_show', 'end_row'];
        $machine_urls = $parent->model("machine")->getQueueServerUrls();
        $num_machines = count($machine_urls);
        if ($num_machines < 1 || ($num_machines == 1 &&
            UrlParser::isLocalhostUrl($machine_urls[0]))) {
            $machine_urls = null;
        }
        $data['leftorright'] = (L\getLocaleDirection() == 'ltr') ?
'right': 'left';
        $classifiers = Classifier::getClassifierList();
        $start_finalizing = false;
        if (isset($_REQUEST['arg']) &&
            in_array($_REQUEST['arg'], $possible_arguments)) {
            // the classifier acted on may come in as either name or class_label
            if (isset($_REQUEST['name'])) {
                $name = substr($parent->clean($_REQUEST['name'], 'string'), 0,
                    C\NAME_LEN);
                $name = Classifier::cleanLabel($name);
            } else if (isset($_REQUEST['class_label'])) {
                $name = substr($parent->clean(
                    $_REQUEST['class_label'], 'string'), 0,
                    C\NAME_LEN);
                $name = Classifier::cleanLabel($name);
            } else {
                $name = "";
            }
            switch ($_REQUEST['arg']) {
                case 'createclassifier':
                    if (!isset($classifiers[$name])) {
                        $classifier = new Classifier($name);
                        Classifier::setClassifier($classifier);
                        $classifiers[$name] = $classifier;
                        return $parent->redirectWithMessage(
                            tl('crawl_component_new_classifier'),
                            $request_fields);
                    } else {
                        return $parent->redirectWithMessage(
                            tl('crawl_component_classifier_exists'),
                            $request_fields);
                    }
                    break;
                case 'deleteclassifier':
                    $_REQUEST['arg'] = empty($_REQUEST['context']) ? 'none':
                        'search';
                    $request_fields[] = 'arg';
                    /*
                       In addition to deleting the classifier, we also want to
                       delete the associated crawl mix (if one exists) used to
                       iterate over existing indexes in search of new training
                       examples.
                     */
                    if (isset($classifiers[$name])) {
                        unset($classifiers[$name]);
                        Classifier::deleteClassifier($name);
                        $mix_name = Classifier::getCrawlMixName($name);
                        $mix_time = $crawl_model->getCrawlMixTimestamp(
                            $mix_name);
                        if ($mix_time) {
                            $crawl_model->deleteCrawlMixIteratorState(
                                $mix_time);
                            $crawl_model->deleteCrawlMix($mix_time);
                        }
                        return $parent->redirectWithMessage(
                            tl('crawl_component_classifier_deleted'),
                            $request_fields);
                    } else {
                        return $parent->redirectWithMessage(
                            tl('crawl_component_no_classifier'),
                            $request_fields);
                    }
                    break;
                case 'editclassifier':
                    if (isset($classifiers[$name])) {
                        $data['class_label'] = $name;
                        $this->editClassifier($data, $classifiers,
                            $machine_urls);
                    } else {
                        return $parent->redirectWithMessage(
                            tl('crawl_component_no_classifier'),
                            $request_fields);
                    }
                    break;
                case 'finalizeclassifier':
                    /*
                       As with edit and delete, refuse to act on a label that
                       has no classifier, rather than starting a trainer for
                       it and dereferencing a missing entry.
                     */
                    if (!isset($classifiers[$name])) {
                        return $parent->redirectWithMessage(
                            tl('crawl_component_no_classifier'),
                            $request_fields);
                    }
                    /*
                       Finalizing is too expensive to be done directly in the
                       controller that responds to the web request. Instead, a
                       daemon is launched to finalize the classifier
                       asynchronously and save it back to disk when it's done.
                       In the meantime, a flag is set to indicate the current
                       finalizing state.
                     */
                    CrawlDaemon::start("ClassifierTrainer", $name, '', -1);
                    $classifier = $classifiers[$name];
                    $classifier->finalized = Classifier::FINALIZING;
                    $start_finalizing = true;
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
                        tl('crawl_component_finalizing_classifier').
                        '</h1>\');';
                    break;
                case 'search':
                    $search_array =
                        $parent->tableSearchRequestHandler($data,
                        "manageClassifiers", ['name']);
                    if (empty($_SESSION['LAST_SEARCH']['manageClassifiers']) ||
                        isset($_REQUEST['name'])) {
                        $_SESSION['LAST_SEARCH']['manageClassifiers'] =
                            $_SESSION['SEARCH']['manageClassifiers'];
                        unset($_SESSION['SEARCH']['manageClassifiers']);
                    } else {
                        $default_search = true;
                    }
                    break;
            }
        }
        $data['classifiers'] = $classifiers;
        if ($search_array == [] || !empty($default_search)) {
            if (!empty($_SESSION['LAST_SEARCH']['manageClassifiers'])) {
                if (!empty($_REQUEST['arg']) && $_REQUEST['arg'] == 'search') {
                    $search_array =
                        $parent->restoreLastSearchFromSession($data,
                        'manageClassifiers');
                } else if (!empty($_REQUEST['context'])) {
                    $search_array =
                        $_SESSION['LAST_SEARCH'][
                        'manageClassifiers']['SEARCH_ARRAY'];
                    $data['PAGING'] =
                        $_SESSION['LAST_SEARCH']['manageClassifiers']['PAGING'];
                }
            }
            if ($search_array == []) {
                $search_array[] = ["name", "", "", "ASC"];
            }
        }
        $parent->pagingLogic($data, 'classifiers', 'classifiers',
            C\DEFAULT_ADMIN_PAGING_NUM, $search_array, "",
            ['name' => 'class_label']);
        // ask the page to reload while any classifier is still finalizing
        $data['reload'] = false;
        foreach ($classifiers as $label => $classifier) {
            if ($classifier->finalized == Classifier::FINALIZING) {
                $data['reload'] = true;
                break;
            }
        }
        if ($data['reload'] && !$start_finalizing) {
            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
                tl('crawl_component_finalizing_classifier'). '</h1>\');';
        }
        return $data;
    }
    /**
     * Handles the particulars of editing a classifier, which includes changing
     * its label and adding training examples.
     *
     * This activity directly handles changing the class label, but not adding
     * training examples. The latter activity is done interactively without
     * reloading the page via XmlHttpRequests, coordinated by the classifier
     * controller dedicated to that task.
*
     * @param array& $data data to be passed on to the view; its
     *     'class_label' entry names the classifier being edited
     * @param array $classifiers map from class labels to their associated
     *     classifiers
     * @param array $machine_urls string urls of machines managed by this
     *     Yioop name server
     */
    public function editClassifier(&$data, $classifiers, $machine_urls)
    {
        $parent = $this->parent;
        $data['ELEMENT'] = 'editclassifier';
        $data['INCLUDE_SCRIPTS'] = ['classifiers'];
        if (!empty($_REQUEST['context']) && $_REQUEST['context']=='search') {
            $data['context'] = 'search';
        }
        // We want recrawls, but not archive crawls.
        $data['CRAWLS'] = $parent->model("crawl")->getCrawlList(false, true,
            $machine_urls);
        $classifier = $classifiers[$data['class_label']];
        $wants_rename = isset($_REQUEST['update']) &&
            $_REQUEST['update'] == 'update' &&
            isset($_REQUEST['rename_label']);
        if ($wants_rename) {
            $new_label = substr($parent->clean($_REQUEST['rename_label'],
                'string'), 0, C\NAME_LEN);
            $new_label = preg_replace('/[^a-zA-Z0-9_]/', '', $new_label);
            if (isset($classifiers[$new_label])) {
                // label already taken: go back to editing the old label
                $_REQUEST['name'] = $_REQUEST['class_label'];
                return $parent->redirectWithMessage(
                    tl('crawl_component_classifier_exists'),
                    ['arg', 'name', 'context']);
            }
            // save under the new label, then remove the old copy
            $old_label = $classifier->class_label;
            $classifier->class_label = $new_label;
            Classifier::setClassifier($classifier);
            Classifier::deleteClassifier($old_label);
            $data['class_label'] = $new_label;
        }
        $data['classifier'] = $classifier;
        // Translations for the classification javascript.
        $script_string_ids = [
            'crawl_component_load_failed',
            'crawl_component_loading',
            'crawl_component_added_examples',
            'crawl_component_label_update_failed',
            'crawl_component_updating',
            'crawl_component_acc_update_failed',
            'crawl_component_na',
            'crawl_component_no_docs',
            'crawl_component_num_docs',
            'crawl_component_in_class',
            'crawl_component_not_in_class',
            'crawl_component_skip',
            'crawl_component_prediction',
            'crawl_component_scores',
        ];
        $script_pairs = [];
        foreach ($script_string_ids as $string_id) {
            $script_pairs[] = $string_id . ':"' . tl($string_id) . '"';
        }
        $data['SCRIPT'] .= "window.tl = {" . implode(',', $script_pairs) .
            '};';
        /*
           We pass along authentication information to the client, so that it
           can authenticate any XmlHttpRequests that it makes in order to label
           documents.
         */
        $time = strval(time());
        $session = md5($time . C\AUTH_KEY);
        $data['SCRIPT'] .=
            "Classifier.initialize(".
                "'{$data['class_label']}',".
                "'{$session}',".
                "'{$time}');";
    }
    /**
     * Handles admin request related to controlling file options to be used
     * in a crawl
     *
     * This activity allows a user to specify the page range size to be
     * used during a crawl as well as which file types can be downloaded
     */
    public function pageOptions()
    {
        PageProcessor::initializeIndexedFileTypes();
        $parent = $this->parent;
        $crawl_model = $parent->model("crawl");
        $profile_model = $parent->model("profile");
        $data["ELEMENT"] = "pageoptions";
        $data['SCRIPT'] = "";
        $machine_urls = $parent->model("machine")->getQueueServerUrls();
        $num_machines = count($machine_urls);
        if ($num_machines < 1 || ($num_machines == 1 &&
            UrlParser::isLocalhostUrl($machine_urls[0]))) {
            $machine_urls = null;
        }
        $data['available_options'] = [
            tl('crawl_component_use_below'),
            tl('crawl_component_use_defaults')];
        $crawls = $crawl_model->getCrawlList(false, true, $machine_urls);
        $data['options_default'] = tl('crawl_component_use_below');
        foreach ($crawls as $crawl) {
            if (strlen($crawl['DESCRIPTION']) > 0 ) {
                $data['available_options'][$crawl['CRAWL_TIME']] =
                    $crawl['DESCRIPTION'];
            }
        }
        $seed_info = $crawl_model->getSeedInfo();
        $data['RECRAWL_FREQS'] =
[-1=>tl('crawl_component_recrawl_never'), 1=>tl('crawl_component_recrawl_1day'), 2=>tl('crawl_component_recrawl_2day'), 3=>tl('crawl_component_recrawl_3day'), 7=>tl('crawl_component_recrawl_7day'), 14=>tl('crawl_component_recrawl_14day')]; $data['SIZE_VALUES'] = [10000=>10000, 50000=>50000, 100000=>100000, 500000=>500000, 1000000=>1000000, 5000000=>5000000, 10000000=>10000000]; $data['LEN_VALUES'] = [2000=>2000, 5000=>5000, 10000=>10000, 50000=>50000, 100000=>100000, 500000=>500000, 1000000=>1000000, 5000000=>5000000, 10000000=>10000000]; $data['available_summarizers'] = [ self::BASIC_SUMMARIZER => tl('crawl_component_basic'), self::CENTROID_SUMMARIZER => tl('crawl_component_centroid'), self::CENTROID_WEIGHTED_SUMMARIZER => tl('crawl_component_centroid_weighted'), self::GRAPH_BASED_SUMMARIZER => tl('crawl_component_graph_based')]; if (!isset($seed_info["indexed_file_types"]["extensions"])) { $seed_info["indexed_file_types"]["extensions"] = PageProcessor::$indexed_file_types; } $loaded = false; if (isset($_REQUEST['load_option']) && $_REQUEST['load_option'] > 0) { if ($_REQUEST['load_option'] == 1) { $seed_loaded = $crawl_model->getSeedInfo(true); } else { $timestamp = substr($parent->clean( $_REQUEST['load_option'], "int"), 0, C\TIMESTAMP_LEN); $seed_loaded = $crawl_model->getCrawlSeedInfo( $timestamp, $machine_urls); } $copy_options = ["general" => ["page_recrawl_frequency", "page_range_request", "max_description_len", "cache_pages", 'summarizer_option'], "indexed_file_types" => ["extensions"], "indexing_plugins" => ["plugins", "plugins_data"]]; foreach ($copy_options as $main_option => $sub_options) { foreach ($sub_options as $sub_option) { if (isset($seed_loaded[$main_option][$sub_option])) { $seed_info[$main_option][$sub_option] = $seed_loaded[$main_option][$sub_option]; } } } if (isset($seed_loaded['page_rules'])) { $seed_info['page_rules'] = $seed_loaded['page_rules']; } if (isset($seed_loaded['active_classifiers'])) { $seed_info['active_classifiers'] = 
$seed_loaded['active_classifiers']; } else { $seed_info['active_classifiers'] = []; $seed_info['active_classifiers']['label'] = []; } $loaded = true; } else { $seed_info = $crawl_model->getSeedInfo(); if (isset($_REQUEST["page_recrawl_frequency"]) && in_array($_REQUEST["page_recrawl_frequency"], array_keys($data['RECRAWL_FREQS']))) { $seed_info["general"]["page_recrawl_frequency"] = $_REQUEST["page_recrawl_frequency"]; } if (isset($_REQUEST["page_range_request"]) && in_array($_REQUEST["page_range_request"], $data['SIZE_VALUES'])) { $seed_info["general"]["page_range_request"] = $_REQUEST["page_range_request"]; } if (isset($_REQUEST['summarizer_option']) && in_array($_REQUEST['summarizer_option'], array_keys($data['available_summarizers']))) { $seed_info['general']['summarizer_option'] = $_REQUEST['summarizer_option']; } if (isset($_REQUEST["max_description_len"]) && in_array($_REQUEST["max_description_len"], $data['LEN_VALUES'])) { $seed_info["general"]["max_description_len"] = $_REQUEST["max_description_len"]; } if (isset($_REQUEST["cache_pages"]) ) { $seed_info["general"]["cache_pages"] = true; } else if (isset($_REQUEST['posted'])) { //form sent but check box unchecked $seed_info["general"]["cache_pages"] = false; } if (isset($_REQUEST['page_rules'])) { $seed_info['page_rules']['rule'] = $parent->convertStringCleanArray( $_REQUEST['page_rules'], 'rule'); } } if (!isset($seed_info["general"]["page_recrawl_frequency"])) { $seed_info["general"]["page_recrawl_frequency"] = C\PAGE_RECRAWL_FREQUENCY; } $data['summarizer_option'] = isset( $seed_info['general']['summarizer_option']) ? 
$seed_info['general']['summarizer_option'] : self::BASIC_SUMMARIZER; $data['PAGE_RECRAWL_FREQUENCY'] = $seed_info["general"]["page_recrawl_frequency"]; if (!isset($seed_info["general"]["cache_pages"])) { $seed_info["general"]["cache_pages"] = false; } $data["CACHE_PAGES"] = $seed_info["general"]["cache_pages"]; if (!isset($seed_info["general"]["page_range_request"])) { $seed_info["general"]["page_range_request"] = C\PAGE_RANGE_REQUEST; } $data['PAGE_SIZE'] = $seed_info["general"]["page_range_request"]; if (!isset($seed_info["general"]["max_description_len"])) { $seed_info["general"]["max_description_len"] = C\MAX_DESCRIPTION_LEN; } $data['MAX_LEN'] = $seed_info["general"]["max_description_len"]; $data['INDEXING_PLUGINS'] = []; $included_plugins = []; if (isset($_REQUEST["posted"]) && !$loaded) { $seed_info['indexing_plugins']['plugins'] = (isset($_REQUEST["INDEXING_PLUGINS"])) ? $_REQUEST["INDEXING_PLUGINS"] : []; } $included_plugins = (isset($seed_info['indexing_plugins']['plugins'])) ? $seed_info['indexing_plugins']['plugins'] : []; foreach ($parent->getIndexingPluginList() as $plugin) { if ($plugin == "") {continue; } $plugin_name = ucfirst($plugin); $data['INDEXING_PLUGINS'][$plugin_name]['checked'] = (in_array($plugin_name, $included_plugins)) ? "checked='checked'" : ""; /* to use method_exists we want that the require_once for the plugin class has occurred so we instantiate the object via the plugin method call which will also do the require if needed. */ $plugin_object = $parent->plugin(lcfirst($plugin_name)); $class_name = C\NS_PLUGINS . 
$plugin_name."Plugin"; if ($loaded && method_exists($class_name, 'setConfiguration') && method_exists($class_name, 'loadDefaultConfiguration')) { if (isset($seed_info['indexing_plugins']['plugins_data'][ $plugin_name])) { $plugin_object->setConfiguration($seed_info[ 'indexing_plugins']['plugins_data'][$plugin_name]); } else { $plugin_object->loadDefaultConfiguration(); } $plugin_object->saveConfiguration(); } if (method_exists($class_name, 'configureHandler') && method_exists($class_name, 'configureView')) { $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = true; $plugin_object->configureHandler($data); } else { $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = false; } } $profile = $profile_model->getProfile(C\WORK_DIRECTORY); if (!isset($_REQUEST['load_option'])) { $data = array_merge($data, $profile); } else { $parent->updateProfileFields($data, $profile, ['IP_LINK','CACHE_LINK', 'SIMILAR_LINK', 'IN_LINK', 'RESULT_SCORE', 'SIGNIN_LINK', 'SUBSEARCH_LINK', 'WORD_SUGGEST']); } $weights = ['TITLE_WEIGHT' => 4, 'DESCRIPTION_WEIGHT' => 1, 'LINK_WEIGHT' => 2, 'MIN_RESULTS_TO_GROUP' => 200, 'SERVER_ALPHA' => 1.6]; $change = false; foreach ($weights as $weight => $value) { if (isset($_REQUEST[$weight])) { $data[$weight] = $parent->clean($_REQUEST[$weight], 'float', 1 ); $profile[$weight] = $data[$weight]; $change = true; } else if (isset($profile[$weight]) && $profile[$weight] != ""){ $data[$weight] = $profile[$weight]; } else { $data[$weight] = $value; $profile[$weight] = $data[$weight]; $change = true; } } if ($change == true) { $profile_model->updateProfile(C\WORK_DIRECTORY, [], $profile); } $data['INDEXED_FILE_TYPES'] = []; $filetypes = []; foreach (PageProcessor::$indexed_file_types as $filetype) { $ison =false; if (isset($_REQUEST["filetype"]) && !$loaded) { if (isset($_REQUEST["filetype"][$filetype])) { $filetypes[] = $filetype; $ison = true; $change = true; } } else { if (isset($seed_info["indexed_file_types"]["extensions"]) && in_array($filetype, 
$seed_info["indexed_file_types"]["extensions"])) { $filetypes[] = $filetype; $ison = true; } } $data['INDEXED_FILE_TYPES'][$filetype] = ($ison) ? "checked='checked'" :''; } $seed_info["indexed_file_types"]["extensions"] = $filetypes; $data['CLASSIFIERS'] = []; $data['RANKERS'] = []; $active_classifiers = []; $active_rankers = []; foreach (Classifier::getClassifierList() as $classifier) { $label = $classifier->class_label; $ison = false; if (isset($_REQUEST['classifier']) && !$loaded) { if (isset($_REQUEST['classifier'][$label])) { $ison = true; } } else if ($loaded || !isset($_REQUEST['posted']) && isset($seed_info['active_classifiers']['label'])) { if (in_array($label, $seed_info['active_classifiers']['label'])) { $ison = true; } } if ($ison) { $data['CLASSIFIERS'][$label] = 'checked="checked"'; $active_classifiers[] = $label; } else { $data['CLASSIFIERS'][$label] = ''; } $ison = false; if (isset($_REQUEST['ranker']) && !$loaded) { if (isset($_REQUEST['ranker'][$label])) { $ison = true; } } else if ($loaded || !isset($_REQUEST['posted']) && isset($seed_info['active_rankers']['label'])) { if (isset($seed_info['active_rankers']['label']) && in_array($label, $seed_info['active_rankers']['label'])) { $ison = true; } } if ($ison) { $data['RANKERS'][$label] = 'checked="checked"'; $active_rankers[] = $label; } else { $data['RANKERS'][$label] = ''; } } $parent->pagingLogic($data, 'CLASSIFIERS', 'CLASSIFIERS', C\DEFAULT_ADMIN_PAGING_NUM/5, [], "", ['name' => 'class_label']); $seed_info['active_classifiers']['label'] = $active_classifiers; $seed_info['active_rankers']['label'] = $active_rankers; if (isset($seed_info['page_rules']['rule'])) { if (isset($seed_info['page_rules']['rule']['rule'])) { $data['page_rules'] = $parent->convertArrayLines( $seed_info['page_rules']['rule']['rule']); } else { $data['page_rules'] = $parent->convertArrayLines( $seed_info['page_rules']['rule']); } } else { $data['page_rules'] = ""; } $allowed_options = ['crawl_time', 'search_time', 
'named_entity', 'test_options']; if (isset($_REQUEST['option_type']) && in_array($_REQUEST['option_type'], $allowed_options)) { $data['option_type'] = $_REQUEST['option_type']; } else { $data['option_type'] = 'crawl_time'; } if ($data['option_type'] == 'crawl_time') { $data['crawl_time_active'] = "active"; $data['search_time_active'] = ""; $data['named_entity_active'] = ""; $data['test_options_active'] = ""; $data['SCRIPT'] .= "\nswitchTab('crawltimetab',". "'searchtimetab', 'namedentitytab', 'testoptionstab')\n"; } else if ($data['option_type'] == 'search_time') { $data['search_time_active'] = "active"; $data['crawl_time_active'] = ""; $data['named_entity_active'] = ""; $data['test_options_active'] = ""; $data['SCRIPT'] .= "\nswitchTab('searchtimetab',". "'crawltimetab', 'namedentitytab', 'testoptionstab')\n"; } else if ($data['option_type'] == 'named_entity') { $data['search_time_active'] = ""; $data['crawl_time_active'] = ""; $data['named_entity_active'] = "active"; $data['test_options_active'] = ""; $data['SCRIPT'] .= "\nswitchTab('namedentitytab'," . "'crawltimetab', 'searchtimetab', 'testoptionstab');\n"; } else { $data['search_time_active'] = ""; $data['crawl_time_active'] = ""; $data['named_entity_active'] = ""; $data['test_options_active'] = "active"; $data['SCRIPT'] .= "\nswitchTab('testoptionstab',". 
"'crawltimetab', 'searchtimetab', 'namedentitytab');\n"; } $crawl_model->setSeedInfo($seed_info); if ($change == true && $data['option_type'] != 'test_options' && $data['option_type'] != 'named_entity') { return $parent->redirectWithMessage( tl('crawl_component_page_options_updated'), ["option_type"], true); } $test_processors = [ "text/html" => "html", "text/asp" => "html", "text/xml" => "xml", "text/robot" => "robot", "application/xml" => "xml", "application/xhtml+xml" => "html", "application/rss+xml" => "rss", "application/atom+xml" => "rss", "text/csv" => "text", "text/gopher" => "gopher", "text/plain" => "text", "text/rtf" => "rtf", "text/tab-separated-values" => "text", ]; $data['MIME_TYPES'] = array_keys($test_processors); $data['page_type'] = "text/html"; if (isset($_REQUEST['page_type']) && in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) { $data['page_type'] = $_REQUEST['page_type']; } $data['TESTPAGE'] = (isset($_REQUEST['TESTPAGE'])) ? $parent->clean($_REQUEST['TESTPAGE'], 'string') : ""; if ($data['option_type'] == 'test_options' && $data['TESTPAGE'] !="") { $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >". 
tl('crawl_component_page_options_running_tests')."</h1>');"; $data['PROCESS_TIMES'] = []; $start_time = microtime(true); $site = []; $site[self::ENCODING] = "UTF-8"; $site[self::URL] = "http://test-site.yioop.com/"; $site[self::IP_ADDRESSES] = ["1.1.1.1"]; $site[self::HTTP_CODE] = 200; $site[self::MODIFIED] = date("U", time()); $site[self::TIMESTAMP] = time(); $site[self::TYPE] = "text/html"; $site[self::HEADER] = "page options test extractor"; $site[self::SERVER] = "unknown"; $site[self::SERVER_VERSION] = "unknown"; $site[self::OPERATING_SYSTEM] = "unknown"; $site[self::LANG] = 'en-US'; $site[self::JUST_METAS] = false; if (isset($_REQUEST['page_type']) && in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) { $site[self::TYPE] = $_REQUEST['page_type']; } if ($site[self::TYPE] == 'text/html') { $site[self::ENCODING] = L\guessEncodingHtml($_REQUEST['TESTPAGE']); } $prefix_name = $test_processors[$site[self::TYPE]]; $processor_name = ucfirst($prefix_name). "Processor"; $plugin_processors = []; if (isset($seed_info['indexing_plugins']['plugins'])) { foreach ($seed_info['indexing_plugins']['plugins'] as $plugin){ if ($plugin == "") { continue; } $plugin_name = C\NS_PLUGINS . $plugin."Plugin"; $tmp_object = new $plugin_name(); $supported_processors = $tmp_object->getProcessors(); foreach ($supported_processors as $supported_processor) { $parent_processor = C\NS_PROCESSORS . $processor_name; do { if (C\NS_PROCESSORS .$supported_processor == $parent_processor) { $plugin_object = $parent->plugin(lcfirst($plugin)); if (method_exists($plugin_name, "loadConfiguration")) { $plugin_object->loadConfiguration(); } $plugin_processors[] = $plugin_object; break; } } while(($parent_processor = get_parent_class($parent_processor)) && $parent_processor != "PageProcessor"); } } } $processor_name = C\NS_PROCESSORS. 
$processor_name; $page_processor = new $processor_name($plugin_processors, $seed_info["general"]["max_description_len"], $seed_info["general"]["summarizer_option"]); set_error_handler(null); $data["PAGE_RANGE_REQUEST"] = $seed_info["general"][ "page_range_request"]; if (L\generalIsA($processor_name, C\NS_PROCESSORS. "HtmlProcessor")) { P\HtmlProcessor::$page_options_testing = true; $page_processor->scrapers = $parent->model("scraper" )->getAllScrapers(); } $doc_info = $page_processor->handle( substr($_REQUEST['TESTPAGE'], 0, $data["PAGE_RANGE_REQUEST"]), $site[self::URL]); set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); if (!$doc_info) { $data["AFTER_PAGE_PROCESS"] = ""; $data["AFTER_RULE_PROCESS"] = ""; $data["EXTRACTED_WORDS"] = ""; $data["EXTRACTED_META_WORDS"] =""; return $data; } if ($processor_name != C\NS_PROCESSORS . "RobotProcessor" && !isset($doc_info[self::JUST_METAS])) { $doc_info[self::LINKS] = UrlParser::pruneLinks( $doc_info[self::LINKS]); } foreach ($doc_info as $key => $value) { $site[$key] = $value; } if (isset($site[self::PAGE])) { unset($site[self::PAGE]); } if (isset($site[self::ROBOT_PATHS])) { $site[self::JUST_METAS] = true; } $reflect = new \ReflectionClass(C\NS_LIB . "CrawlConstants"); $crawl_constants = $reflect->getConstants(); $crawl_keys = array_keys($crawl_constants); $crawl_values = array_values($crawl_constants); $inverse_constants = array_combine($crawl_values, $crawl_keys); $after_process = []; foreach ($site as $key => $value) { $out_key = (isset($inverse_constants[$key])) ? 
$inverse_constants[$key] : $key; $after_process[$out_key] = $value; } $data["AFTER_PAGE_PROCESS"] = wordwrap($parent->clean( print_r($after_process, true), "string"), 75, "\n", true); $data['PROCESS_TIMES']['PAGE_PROCESS'] = L\changeInMicrotime( $start_time); $rule_time = microtime(true); $rule_string = implode("\n", $seed_info['page_rules']['rule']); $rule_string = html_entity_decode($rule_string, ENT_QUOTES); $page_rule_parser = new PageRuleParser($rule_string); $page_rule_parser->executeRuleTrees($site); $after_process = []; foreach ($site as $key => $value) { $out_key = (isset($inverse_constants[$key])) ? $inverse_constants[$key] : $key; $after_process[$out_key] = $value; } $data["AFTER_RULE_PROCESS"] = wordwrap($parent->clean( print_r($after_process, true), "string"), 75, "\n", true); $lang = null; $data['PROCESS_TIMES']['RULE_PROCESS'] = L\changeInMicrotime( $rule_time); $rule_time = microtime(true); if (isset($site[self::LANG])) { $lang = $site[self::LANG]; } $meta_ids = PhraseParser::calculateMetas($site); if (!$site[self::JUST_METAS]) { $host_words = UrlParser::getWordsIfHostUrl($site[self::URL]); $path_words = UrlParser::getWordsLastPathPartUrl( $site[self::URL]); $phrase_string = $host_words." .. ".$site[self::TITLE] . " .. ". $path_words . " .. ". 
$site[self::DESCRIPTION]; if ($site[self::TITLE] != "" ) { $lang = L\guessLocaleFromString($site[self::TITLE], $lang); } else { $lang = L\guessLocaleFromString( substr($site[self::DESCRIPTION], 0, C\AD_HOC_TITLE_LENGTH), $lang); } $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang); $len = strlen($phrase_string); if (PhraseParser::computeSafeSearchScore( $word_lists['WORD_LIST'], $len) < 0.012) { $meta_ids[] = "safe:true"; $safe = true; } else { $meta_ids[] = "safe:false"; $safe = false; } } if (!isset($word_lists['WORD_LIST'])) { $word_lists['WORD_LIST'] = []; } if (!isset($word_lists['QUESTION_ANSWER_LIST'])) { $word_lists['QUESTION_ANSWER_LIST'] = []; } $data["EXTRACTED_WORDS"] = wordwrap($parent->clean( print_r($word_lists['WORD_LIST'], true), "string"), 75, "\n", true); $data["EXTRACTED_META_WORDS"] = wordwrap($parent->clean( print_r($meta_ids, true), "string"), 75, "\n", true); $data["QUESTIONS_TRIPLET"] = wordwrap($parent->clean( print_r($word_lists['QUESTION_ANSWER_LIST'], true), "string"), 75, "\n", true); $data['PROCESS_TIMES']['TOTAL'] = L\changeInMicrotime($start_time); $data['PROCESS_TIMES'] = array_merge($data['PROCESS_TIMES'], $word_lists['TIMES']); } else if ($data['option_type'] == 'named_entity') { $data= array_merge($data, CrawlComponent::addNamedEntites($data)); } $languages = $parent->model("locale")->getLocaleList(); foreach ($languages as $language) { $data['LOCALES'][$language['LOCALE_TAG']] = $language['LOCALE_NAME']; } $entity_model = $parent->model("entity"); $locale_model = $parent->model("locale"); $locale_tag = L\getLocaleTag(); $data['CURRENT_LOCALE_TAG'] = !empty($_REQUEST['LOCALE_TAG']) ? 
$_REQUEST['LOCALE_TAG'] : $locale_tag; //$locale_tag = $data['CURRENT_LOCALE_TAG']; $locale_tag = $data['CURRENT_LOCALE_TAG']; $data['CURRENT_LOCALE_NAME'] = $locale_model->getLocaleName($locale_tag); $data['ENTITIES'] = $entity_model->viewAll($locale_tag); $data['NUM_ENTITIES_SHOW'] = 100; $data['TOTAL_ENTITIES'] = count($data['ENTITIES']); $data['LIMIT'] = (isset($_REQUEST['limit'])) ? min($parent->clean($_REQUEST['limit'], 'int'), $data['TOTAL_ENTITIES']) : 0; $data['ENTITIES'] = array_slice($data['ENTITIES'], $data['LIMIT'], $data['NUM_ENTITIES_SHOW']); $data['upload_options'] = [tl('pageoptions_element_actions'), tl('pageoptions_element_add_entity'), tl('pageoptions_element_file_upload')]; $data['entity_default'] = [tl('pageoptions_element_actions')]; return $data; } /** * Handles admin request related to the Scrapers activity * * This activity allows a user to specify the configuration for the * ways we detect Scrapers * * @return array $data info about the Scraper settings */ public function scrapers() { $parent = $this->parent; $crawl_model = $parent->model("crawl"); $scraper_model = $parent->model("scraper"); $possible_arguments = ["add", "delete", "edit"]; $request_fields = ['start_row', 'num_show', 'end_row']; $data = []; $data["ELEMENT"] = "scrapers"; $data['SCRIPT'] = ""; $n = C\NUM_RESULTS_PER_PAGE; $data['PER_PAGE'] = [$n => $n, 2*$n => 2*$n, 5*$n=> 5*$n, 10*$n=>10*$n]; if (isset($_REQUEST['per_page']) && in_array($_REQUEST['per_page'], array_keys($data['PER_PAGE']))) { $data['PER_PAGE_SELECTED'] = $_REQUEST['per_page']; } else { $data['PER_PAGE_SELECTED'] = C\NUM_RESULTS_PER_PAGE; } $data["CURRENT_SCRAPER"] = [ "name" => "", "signature" => "", "scrape_rules" => ""]; $data['FORM_TYPE'] = "add"; if (isset($_REQUEST['arg']) && in_array($_REQUEST['arg'], $possible_arguments)) { switch ($_REQUEST['arg']) { case "add": if (empty($_REQUEST['name']) || empty($_REQUEST['signature']) || empty($_REQUEST['scrape_rules'])) { return $parent->redirectWithMessage( 
tl('crawl_component_scraper_missing fields'), $request_fields); } $scraper_model->add( $parent->clean($_REQUEST['name'], "string"), $parent->clean($_REQUEST['signature'], "string"), $parent->clean($_REQUEST['scrape_rules'], "string")); return $parent->redirectWithMessage( tl('crawl_component_scraper_added'), $request_fields); break; case "delete": if (empty($_REQUEST['id'])) { return $parent->redirectWithMessage( tl('crawl_component_no_delete_scraper'), $request_fields); } $scraper_id = $parent->clean($_REQUEST['id'], "string"); $scraper_model->delete($scraper_id); return $parent->redirectWithMessage( tl('crawl_component_scraper_deleted'), $request_fields); break; case "edit": $data['FORM_TYPE'] = "edit"; $scraper = false; $scraper_id = (isset($_REQUEST['id'])) ? $parent->clean($_REQUEST['id'], "string") : ""; if ($scraper_id) { $scraper = $scraper_model->get($scraper_id); } if (!$scraper) { $data['FORM_TYPE'] = "add"; break; } $data['id'] = $scraper_id; $update = false; foreach ($data['CURRENT_SCRAPER'] as $field => $value) { $upper_field = strtoupper($field); if (isset($_REQUEST[$field]) && $field != 'name') { $scraper[$upper_field] = $parent->clean( $_REQUEST[$field], "string"); $data['CURRENT_SCRAPER'][$field] = $scraper[$upper_field]; $update = true; } else if (!empty($scraper[$upper_field])){ $data['CURRENT_SCRAPER'][$field] = $scraper[$upper_field]; } } if ($update) { $scraper_model->update($scraper); $fields = array_merge(["arg", "id"], $request_fields); return $parent->redirectWithMessage( tl('crawl_component_scraper_updated'), $fields); } break; } } $parent->pagingLogic($data, $scraper_model, "SCRAPERS", C\DEFAULT_ADMIN_PAGING_NUM/5, [["NAME", "", "", "ASC"]]); return $data; } /** * Handles admin request related to the search filter activity * * This activity allows a user to specify hosts whose web pages are to be * filtered out the search results * * @return array $data info about the groups and their contents for a * particular crawl mix */ public 
function resultsEditor() { $parent = $this->parent; $filters_model = $parent->model("searchfilters"); $data["ELEMENT"] = "resultseditor"; $data['SCRIPT'] = ""; if (isset($_REQUEST['disallowed_sites'])) { $sites = $parent->convertStringCleanArray( $_REQUEST['disallowed_sites']); $disallowed_sites = []; foreach ($sites as $site) { $site = UrlParser::getHost($site); if (strlen($site) > 0) { $disallowed_sites[] = $site."/"; } } $data['disallowed_sites'] = implode("\n", $disallowed_sites); $filters_model->set($disallowed_sites); return $parent->redirectWithMessage( tl('crawl_component_results_editor_update'), ["URL", "TITLE", "DESCRIPTION"]); } if (!isset($data['disallowed_sites'])) { $data['disallowed_sites'] = implode("\n", $filters_model->getUrls()); } foreach (array("URL", "TITLE", "DESCRIPTION") as $field) { $data[$field] = (isset($_REQUEST[$field])) ? $parent->clean($_REQUEST[$field], "string") : ((isset($data[$field]) ) ? $data[$field] : ""); } if ($data["URL"] != "") { $data["URL"] = UrlParser::canonicalLink($data["URL"],""); } $tmp = tl('crawl_component_edited_pages'); $data["URL_LIST"] = array ($tmp => $tmp); $summaries = $filters_model->getEditedPageSummaries(); foreach ($summaries as $hash => $summary) { $data["URL_LIST"][$summary[self::URL]] = $summary[self::URL]; } if (isset($_REQUEST['arg']) ) { switch ($_REQUEST['arg']) { case "save_page": $missing_page_field = ($data["URL"] == "") ? 
true: false; if ($missing_page_field) { return $parent->redirectWithMessage( tl('crawl_component_results_editor_need_url'), ["URL", "TITLE", "DESCRIPTION"]); } else { $filters_model->updateResultPage( $data["URL"], $data["TITLE"], $data["DESCRIPTION"]); return $parent->redirectWithMessage( tl('crawl_component_results_editor_page_updated'), ["URL", "TITLE", "DESCRIPTION"]); } break; case "load_url": $hash_url = L\crawlHash($_REQUEST['LOAD_URL'], true); if (isset($summaries[$hash_url])) { $_REQUEST["URL"] = $parent->clean($_REQUEST['LOAD_URL'], "web-url"); $_REQUEST["TITLE"] = $summaries[$hash_url][self::TITLE]; $_REQUEST["DESCRIPTION"] = $summaries[$hash_url][ self::DESCRIPTION]; return $parent->redirectWithMessage( tl('crawl_component_results_editor_page_loaded'), ["URL", "TITLE", "DESCRIPTION"]); } break; } } return $data; } /** * Handles admin request related to the search sources activity * * The search sources activity allows a user to add/delete search sources * for video and news, it also allows a user to control which subsearches * appear on the SearchView page * * @return array $data info about current search sources, and current * sub-searches */ public function searchSources() { $parent = $this->parent; $crawl_model = $parent->model("crawl"); $source_model = $parent->model("source"); $possible_arguments = ["addsource", "deletesource", "addsubsearch", "deletesubsearch", "editsource", "editsubsearch"]; $request_fields = ['start_row', 'num_show', 'end_row', 'SUBstart_row','SUBnum_show', 'SUBend_row']; $data = []; $data["ELEMENT"] = "searchsources"; $data['SCRIPT'] = ""; $data['SOURCE_TYPES'] = [-1 => tl('crawl_component_media_kind'), "video" => tl('crawl_component_video'), "rss" => tl('crawl_component_rss_feed'), "json" => tl('crawl_component_json_feed'), "html" => tl('crawl_component_html_feed'), "regex" => tl('crawl_component_regex_feed'), ]; $source_type_flag = false; if (isset($_REQUEST['type']) && in_array($_REQUEST['type'], 
array_keys($data['SOURCE_TYPES']))) { $data['SOURCE_TYPE'] = $_REQUEST['type']; $source_type_flag = true; } else { $data['SOURCE_TYPE'] = -1; } $machine_urls = $parent->model("machine")->getQueueServerUrls(); $search_lists = $crawl_model->getCrawlList(false, true, $machine_urls); $data["SEARCH_LISTS"] = [-1 => tl('crawl_component_sources_indexes')]; foreach ($search_lists as $item) { $data["SEARCH_LISTS"]["i:".$item["CRAWL_TIME"]] = $item["DESCRIPTION"]; } if (isset($_SESSION['USER_ID'])) { $user = $_SESSION['USER_ID']; } else { $user = L\remoteAddress(); } $search_lists= $crawl_model->getMixList($user); foreach ($search_lists as $item) { $data["SEARCH_LISTS"]["m:".$item["TIMESTAMP"]] = $item["NAME"]; } $n = C\NUM_RESULTS_PER_PAGE; $data['PER_PAGE'] = [$n => $n, 2*$n => 2*$n, 5*$n=> 5*$n, 10*$n=>10*$n]; if (isset($_REQUEST['per_page']) && in_array($_REQUEST['per_page'], array_keys($data['PER_PAGE']))) { $data['PER_PAGE_SELECTED'] = $_REQUEST['per_page']; } else { $data['PER_PAGE_SELECTED'] = C\NUM_RESULTS_PER_PAGE; } $locales = $parent->model("locale")->getLocaleList(); $data["LANGUAGES"] = []; foreach ($locales as $locale) { $data["LANGUAGES"][$locale['LOCALE_TAG']] = $locale['LOCALE_NAME']; } if (isset($_REQUEST['language']) && in_array($_REQUEST['language'], array_keys($data["LANGUAGES"]))) { $data['SOURCE_LOCALE_TAG'] = $_REQUEST['language']; } else { $data['SOURCE_LOCALE_TAG'] = C\DEFAULT_LOCALE; } $data["CURRENT_SOURCE"] = [ "name" => "", "type"=> $data['SOURCE_TYPE'], "source_url" => "", "aux_info" => "", 'category' => "news", 'channel_path' => "", "image_xpath" =>"", 'item_path' => "", 'title_path' => "", 'description_path' => "", 'link_path' => "", "language" => $data['SOURCE_LOCALE_TAG']]; $data["CURRENT_SUBSEARCH"] = [ "locale_string" => "", "folder_name" =>"", "index_identifier" => "", "per_page" => $data['PER_PAGE_SELECTED']]; $data['SOURCE_FORM_TYPE'] = "addsource"; $data["SEARCH_FORM_TYPE"] = "addsubsearch"; if (isset($_REQUEST['arg']) && 
in_array($_REQUEST['arg'], $possible_arguments)) { switch ($_REQUEST['arg']) { case "addsource": if (!$source_type_flag) { return $parent->redirectWithMessage( tl('crawl_component_no_source_type'), $request_fields); } $must_have = ["name", "type", 'source_url']; $is_parse_feed = false; if (isset($_REQUEST['type']) && in_array($_REQUEST['type'], ['html', 'json', 'regex'] )) { $is_parse_feed = true; $must_have = array_merge($must_have, [ 'channel_path', 'item_path', 'title_path', 'description_path', 'link_path']); } if (isset($_REQUEST['type']) && $_REQUEST['type'] == -1) { return $parent->redirectWithMessage( tl('crawl_component_missing_type'), array_merge($request_fields, $must_have)); } $to_clean = array_merge($must_have, ['aux_info', 'category','language', 'image_xpath']); foreach ($to_clean as $clean_me) { $r[$clean_me] = (isset($_REQUEST[$clean_me])) ? trim($parent->clean($_REQUEST[$clean_me], "string")) : ""; if ($clean_me == "source_url") { $r[$clean_me] = UrlParser::canonicalLink( $r[$clean_me], ""); if (!$r[$clean_me]) { return $parent->redirectWithMessage( tl('crawl_component_invalid_url'), array_merge($request_fields, $to_clean)); } } if (in_array($clean_me, $must_have) && $r[$clean_me] == "" ) { return $parent->redirectWithMessage( tl('crawl_component_missing_fields'), array_merge($request_fields, $to_clean)); } } if ($is_parse_feed) { $r['aux_info'] = $r['channel_path']."###". $r['item_path']."###".$r['title_path']. "###".$r['description_path']."###".$r['link_path']. "###".$r['image_xpath']."###".$r['category']; } else if (isset($_REQUEST['type']) && $_REQUEST['type'] == 'rss') { $r['aux_info'] = $r['image_xpath'] . "###" . 
$r['category']; } $source_model->addMediaSource( $r['name'], $r['type'], $r['source_url'], $r['aux_info'], $r['language']); return $parent->redirectWithMessage( tl('crawl_component_media_source_added'), $request_fields); break; case "addsubsearch": $to_clean = ["folder_name", 'index_identifier']; $must_have = $to_clean; foreach ($to_clean as $clean_me) { $r[$clean_me] = (isset($_REQUEST[$clean_me])) ? trim($parent->clean($_REQUEST[$clean_me], "string")) : ""; if (in_array($clean_me, $must_have) && ($r[$clean_me] == "" || $r[$clean_me] == -1)) { return $parent->redirectWithMessage( tl('crawl_component_missing_fields'), array_merge($request_fields, $to_clean)); } } $source_model->addSubsearch( $r['folder_name'], $r['index_identifier'], $data['PER_PAGE_SELECTED']); return $parent->redirectWithMessage( tl('crawl_component_subsearch_added'), $request_fields); break; case "deletesource": if (!isset($_REQUEST['ts'])) { return $parent->redirectWithMessage( tl('crawl_component_no_delete_source'), $request_fields); } $timestamp = $parent->clean($_REQUEST['ts'], "string"); $source_model->deleteMediaSource($timestamp); return $parent->redirectWithMessage( tl('crawl_component_media_source_deleted'), $request_fields); break; case "deletesubsearch": if (!isset($_REQUEST['fn'])) { return $parent->redirectWithMessage( tl('crawl_component_no_delete_source'), $request_fields); break; } $folder_name = $parent->clean($_REQUEST['fn'], "string"); $source_model->deleteSubsearch($folder_name); return $parent->redirectWithMessage( tl('crawl_component_subsearch_deleted'), $request_fields); break; case "editsubsearch": $data['SEARCH_FORM_TYPE'] = "editsubsearch"; $subsearch = false; $folder_name = (isset($_REQUEST['fn'])) ? 
$parent->clean($_REQUEST['fn'], "string") : ""; if ($folder_name) { $subsearch = $source_model->getSubsearch($folder_name); } if (!$subsearch) { $data['SOURCE_FORM_TYPE'] = "addsubsearch"; break; } $data['fn'] = $folder_name; $update = false; foreach ($data['CURRENT_SUBSEARCH'] as $field => $value) { $upper_field = strtoupper($field); if (isset($_REQUEST[$field]) && $field != 'name') { $subsearch[$upper_field] = $parent->clean( $_REQUEST[$field], "string"); $data['CURRENT_SUBSEARCH'][$field] = $subsearch[$upper_field]; $update = true; } else if (isset($subsearch[$upper_field])){ $data['CURRENT_SUBSEARCH'][$field] = $subsearch[$upper_field]; } } if ($update) { $fields = array_merge(array("arg", "fn"), $request_fields); $source_model->updateSubsearch($subsearch); return $parent->redirectWithMessage( tl('crawl_component_subsearch_updated'), $fields); } break; case "editsource": $data['SOURCE_FORM_TYPE'] = "editsource"; $source = false; $timestamp = (isset($_REQUEST['ts'])) ? $parent->clean($_REQUEST['ts'], "string") : ""; if ($timestamp) { $source = $source_model->getMediaSource($timestamp); } if (!$source) { $data['SOURCE_FORM_TYPE'] = "addsource"; break; } $data['ts'] = $timestamp; $update = false; $is_parse_feed = false; $is_rss_feed = false; if (in_array($source['TYPE'], ['html', 'json', 'regex'])) { $is_parse_feed = true; $aux_parts = explode("###", $source['AUX_INFO']); list($source['CHANNEL_PATH'], $source['ITEM_PATH'], $source['TITLE_PATH'], $source['DESCRIPTION_PATH'], $source['LINK_PATH']) = $aux_parts; if (isset($aux_parts[5])) { $source['IMAGE_XPATH'] = $aux_parts[5]; if (isset($aux_parts[6])) { $source['CATEGORY'] = $aux_parts[6]; } } else { $source['IMAGE_XPATH'] = ""; $source['CATEGORY'] = "news"; } } else if ($source['TYPE'] == 'rss') { $is_rss_feed = true; $aux_parts = explode("###", $source['AUX_INFO']); if (isset($aux_parts[0])) { $source['IMAGE_XPATH'] = $aux_parts[0]; if (isset($aux_parts[1])) { $source['CATEGORY'] = $aux_parts[1]; } } else { 
$source['IMAGE_XPATH'] = ""; $source['CATEGORY'] = "news"; } } foreach ($data['CURRENT_SOURCE'] as $field => $value) { $upper_field = strtoupper($field); if (isset($_REQUEST[$field]) && $field != 'name') { $source[$upper_field] = $parent->clean( $_REQUEST[$field], "string"); $data['CURRENT_SOURCE'][$field] = $source[$upper_field]; $update = true; } else if (isset($source[$upper_field])){ $data['CURRENT_SOURCE'][$field] = $source[$upper_field]; } } if ($update) { if ($is_parse_feed) { $source['AUX_INFO'] = $source['CHANNEL_PATH']."###" . $source['ITEM_PATH']."###". $source['TITLE_PATH'] . "###" . $source['DESCRIPTION_PATH'] . "###". $source['LINK_PATH']. "###". $source['IMAGE_XPATH']. "###". $source['CATEGORY']; } else if ($is_rss_feed) { $source['AUX_INFO'] = $source['IMAGE_XPATH'] . "###" . $source['CATEGORY']; } unset($source['CHANNEL_PATH']); unset($source['ITEM_PATH']); unset($source['TITLE_PATH']); unset($source['DESCRIPTION_PATH']); unset($source['LINK_PATH']); unset($source['IMAGE_XPATH']); unset($source['CATEGORY']); $source_model->updateMediaSource($source); $fields = array_merge(array("arg", "ts"), $request_fields); return $parent->redirectWithMessage( tl('crawl_component_media_source_updated'), $fields); } break; } } $data['CAN_LOCALIZE'] = $parent->model("user")->isAllowedUserActivity( $_SESSION['USER_ID'], "manageLocales"); $parent->pagingLogic($data, $source_model, "MEDIA_SOURCES", C\DEFAULT_ADMIN_PAGING_NUM/5, [["NAME", "", "", "ASC"]]); $parent->pagingLogic($data, $source_model, "SUBSEARCHES", C\DEFAULT_ADMIN_PAGING_NUM/5, [ ["FOLDER_NAME", "", "", "ASC"]], "SUB", "SUBSEARCH"); foreach ($data["SUBSEARCHES"] as $search) { if (!isset($data["SEARCH_LISTS"] [trim($search['INDEX_IDENTIFIER'])])) { $source_model->deleteSubsearch($search["FOLDER_NAME"]); } } $data['SCRIPT'] .= "source_type = elt('source-type');". "source_type.onchange = switchSourceType;". 
"switchSourceType()"; return $data; }
    /**
     * Handles requests to add or delete named entities via the entity model.
     *
     * At most one operation is carried out per call, chosen in this order:
     * (1) if the request has an ENTITY_NAME field, addEntity is called with
     * that name; (2) otherwise, if an ENTITY_FILE upload is present and has
     * a non-empty temporary file name, addEntities is called with that
     * temporary file name; (3) otherwise, if the request's arg field is
     * 'deleteentities' and a value field is present, deleteEntity is called
     * with that value. Every operation also passes along the LOCALE_TAG
     * request field; if LOCALE_TAG is missing the selected operation is
     * skipped, rather than calling the model with an undefined locale.
     *
     * Request values are handed to the entity model as received.
     * NOTE(review): other activities in this component pass request data
     * through $parent->clean() first -- confirm whether the entity model
     * expects raw or cleaned values.
     *
     * The spelling of the method name is kept as is so that existing callers
     * continue to work.
     *
     * @param array $data fields assembled so far for the view; this method
     *      does not modify them
     * @return array $data the array that was passed in, unchanged
     */
    public function addNamedEntites($data)
    {
        $entity_model = $this->parent->model("entity");
        // every operation below needs a locale, so check for it only once
        $has_locale = isset($_REQUEST['LOCALE_TAG']);
        if (isset($_REQUEST['ENTITY_NAME'])) {
            if ($has_locale) {
                $entity_model->addEntity($_REQUEST['ENTITY_NAME'],
                    $_REQUEST['LOCALE_TAG']);
            }
        } else if (isset($_FILES['ENTITY_FILE'])) {
            $file_name = isset($_FILES['ENTITY_FILE']['tmp_name']) ?
                $_FILES['ENTITY_FILE']['tmp_name'] : "";
            // an empty tmp_name means no file actually arrived
            if ($has_locale && !empty($file_name)) {
                $entity_model->addEntities($file_name,
                    $_REQUEST['LOCALE_TAG']);
            }
        } else if (isset($_REQUEST['arg']) &&
            $_REQUEST['arg'] === 'deleteentities') {
            if ($has_locale && isset($_REQUEST['value'])) {
                $entity_model->deleteEntity($_REQUEST['value'],
                    $_REQUEST['LOCALE_TAG']);
            }
        }
        return $data;
    }
}