diff --git a/bin/fetcher.php b/bin/fetcher.php index 1cd0a367c..faf4e9820 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -529,7 +529,6 @@ class Fetcher implements CrawlConstants crawlLog("New name: ".$this->web_archive->dir_name); crawlLog("Switching archive..."); - } if(isset($info[self::SAVED_CRAWL_TIMES])) { @@ -661,19 +660,20 @@ class Fetcher implements CrawlConstants function downloadPagesArchiveCrawl() { $prefix = $this->fetcher_num."-"; - $base_name = CRAWL_DIR."/cache/{$prefix}".self::archive_base_name. - $this->crawl_index; + $arc_name = "$prefix" . self::archive_base_name . $this->crawl_index; + $base_name = CRAWL_DIR."/cache/$arc_name"; $pages = array(); if(!isset($this->archive_iterator->iterate_timestamp) || $this->archive_iterator->iterate_timestamp != $this->crawl_index || $this->archive_iterator->result_timestamp != $this->crawl_time) { if(!file_exists($base_name)){ - crawlLog("Recrawl archive with timestamp" . - " {$this->crawl_index} does not exist!"); + crawlLog("!!Fetcher web archive $arc_name does not exist."); + crawlLog(" Only fetchers involved in original crawl will "); + crawlLog(" participate in a web archive recrawl!!"); return $pages; } else { $this->archive_iterator = - new WebArchiveBundle($prefix, $this->crawl_index, + new WebArchiveBundleIterator($prefix, $this->crawl_index, $this->crawl_time); if($this->archive_iterator == NULL) { crawlLog("Error creating archive iterator!!"); @@ -843,8 +843,9 @@ class Fetcher implements CrawlConstants } else { $update_num = SEEN_URLS_BEFORE_UPDATE_SCHEDULER; crawlLog("Fetch on crawl {$this->crawl_time} was not ". - "halted properly, dumping $update_num from old fetch ". - "to try to make a clean re-start"); + "halted properly."); + crawlLog(" Dumping $update_num from old fetch ". + "to try to make a clean re-start."); $count = count($this->to_crawl); if($count > SEEN_URLS_BEFORE_UPDATE_SCHEDULER) { $this->to_crawl = array_slice($this->to_crawl, diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php index 69988a4db..54e2886ca 100755 --- a/controllers/admin_controller.php +++ b/controllers/admin_controller.php @@ -860,13 +860,6 @@ class AdminController extends Controller implements CrawlConstants $crawl_params[self::META_WORDS] = isset($seed_info['meta_words']) ? $seed_info['meta_words'] : array(); - $crawl_params[self::LOG_RECORDS] = isset($seed_info['log_records']) ? - $seed_info['log_records'] : array(); - - $crawl_params[self::DATABASE_CONNECTION_DETAILS] = - isset($seed_info['database_connection_details']) ? - $seed_info['database_connection_details'] : array(); - if(isset($seed_info['indexing_plugins']['plugins'])) { $crawl_params[self::INDEXING_PLUGINS] = $seed_info['indexing_plugins']['plugins']; @@ -1085,48 +1078,6 @@ class AdminController extends Controller implements CrawlConstants $data['META_WORDS'] = $seed_info['meta_words']; } - $data['LOG_RECORDS'] = array(); - if(!$no_further_changes) { - if(isset($_REQUEST["LOG_RECORDS"])){ - foreach($_REQUEST["LOG_RECORDS"] as $triplet) { - list($field, $field_name,$field_type) = - array_values($triplet); - $field = $this->clean($field, "string"); - $field_name = - $this->clean($field_name, "string"); - $field_type = - $this->clean($field_type,"string"); - $field_nt = $field_name."::".$field_type; - if(trim($field) != "" &&trim($field_nt) !=""){ - $data['LOG_RECORDS'][$field] = $field_nt; - } - } - $seed_info['log_records'] = $data['LOG_RECORDS']; - $update_flag = true; - } else if(isset($seed_info['log_records'])){ - $data['LOG_RECORDS'] = $seed_info['log_records']; - } - } else if(isset($seed_info['log_records'])){ - $data['LOG_RECORDS'] = $seed_info['log_records']; - } - - $data['DATABASE_CONNECTION_DETAILS'] = array(); - if(!$no_further_changes) { - if(isset($_REQUEST["DATABASE_CONNECTION_DETAILS"])){ - $data['DATABASE_CONNECTION_DETAILS']= - $_REQUEST["DATABASE_CONNECTION_DETAILS"]; - $seed_info['database_connection_details'] = - $data['DATABASE_CONNECTION_DETAILS']; - $update_flag = true; - } else if(isset($seed_info['database_connection_details'])) { - $data['DATABASE_CONNECTION_DETAILS'] = - $seed_info['database_connection_details']; - } - } else if(isset($seed_info['database_connection_details'])) { - $data['DATABASE_CONNECTION_DETAILS'] = - $seed_info['database_connection_details']; - } - $data['INDEXING_PLUGINS'] = array(); $included_plugins = array(); if(!$no_further_changes && isset($_REQUEST["posted"])) { diff --git a/css/search.css b/css/search.css index c94c889e1..65898598b 100755 --- a/css/search.css +++ b/css/search.css @@ -1734,107 +1734,6 @@ ul.in-list li width: 97%; } -.log-records-table -{ - width:100%; -} - -.log-records-table, -.log-records-table td, -.log-records-table th -{ - border: 1px ridge black; -} - -.log-records-table th -{ - padding: 0.03in; - text-align: center; -} - -.log-records-table td.input-field -{ - width: 1.3in; -} - -.log-records-table td.input-field input -{ - margin: 0.05in; - width: 1.5in; -} - -.log-records-table td.input-field-name -{ - margin: 0.03in; - width: 100%; -} - -.log-records-table td.input-field-name input -{ - margin: 0.05in; - width: 97%; -} - -.log-records-table td.input-field-type -{ - margin: 0.03in; - width: 100%; -} - -.log-records-table td.input-field-type input -{ - margin: 0.05in; - width: 97%; -} - -.html-rtl .log-record-new-field -{ - position: relative; -} - -.html-ltr .log-record-new-field -{ - position: relative; -} - -.database-connection-details-table -{ - width:100%; -} - -.database-connection-details-table, -.database-connection-details-table td -{ - border: 1px ridge black; -} - -.database-connection-details-table td.input-name -{ - width: 1.3in; -} - -.database-connection-details-table td.input-data -{ - margin: 0.03in; - width: 100%; -} - -.database-connection-details-table td.input-data input -{ - margin: 0.05in; - width: 98%; -} - -.html-rtl .database-connection-details-submit -{ - position: relative; -} - -.html-ltr .database-connection-details-submit -{ - position: relative; -} - .indexing-plugin-table { width:100%; diff --git a/lib/archive_bundle_iterators/database_archive_bundle_iterator.php b/lib/archive_bundle_iterators/database_archive_bundle_iterator.php deleted file mode 100644 index df94e54d2..000000000 --- a/lib/archive_bundle_iterators/database_archive_bundle_iterator.php +++ /dev/null @@ -1,375 +0,0 @@ -<?php -/** - * SeekQuarry/Yioop -- - * Open Source Pure PHP Search Engine, Crawler, and Indexer - * - * Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org - * - * LICENSE: - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * END LICENSE - * - * @author Tanmayee Potluri - * @package seek_quarry - * @subpackage iterator - * @license http://www.gnu.org/licenses/ GPL3 - * @link http://www.seekquarry.com/ - * @copyright 2009 - 2013 - * @filesource - */ - -if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} - -/** - *Loads base class for iterating - */ -require_once BASE_DIR. - '/lib/archive_bundle_iterators/archive_bundle_iterator.php'; - -/** - * Used to iterate through the records of a database stored in a - * DatabaseArchiveBundle folder. Database is a collection of tables with - * various rows in each table. Iteration would be for the purpose of making - * an index of each of these records. - * - * @author Tanmayee Potluri - * @package seek_quarry - * @subpackage iterator - * @see WebArchiveBundle - */ -class DatabaseArchiveBundleIterator extends ArchiveBundleIterator - implements CrawlConstants -{ - /** - * The path to the directory containing the archive partitions to be - * iterated over. - * @var string - */ - var $iterate_dir; - /** - * The path to the directory where the iteration status is stored. - * @var string - */ - var $result_dir; - /** - * The path to the directory where all html files are stored - * and used to point to. - * @var string - */ - var $index_dir; - - /** - * The part of the path to the directory where all html files are stored - * and used to point to. - * @var string - */ - var $path_for_html_files; - /** - * current number of record in the current database - * @var int - */ - var $current_page_num; - /** - * The number of database records in this database archive bundle - * @var int - */ - var $num_of_records; - /** - * Array of database records according to the query specified by the user - * @var array - */ - var $records; - /** - * Array of fields of database record specified by the user - * @var array - */ - var $fields; - /** - * Array of field names in database record specified by the user - * @var array - */ - var $field_names; - /** - * Array of database fieldtypes specified by the user - * @var array - */ - var $field_types; - /** - * Database handle for a database - * @var resource - */ - var $db_handle; - /** - * Whether database exists or not - * @var resource - */ - var $db_found; - /** - * Array of database connection details to connect to database - * @var array - */ - var $databaseConnectionsArray; - /** - * Host name for the database - * @var string - */ - var $host; - /** - * User Name for the localhost - * @var string - */ - var $user_name; - /** - * Password for the connection - * @var string - */ - var $password; - /** - * Name of the database for the records - * @var string - */ - var $database; - /** - * Query to retrieve the records to be indexed from the database - * @var array - */ - var $query; - /** - * Constants required for database archive_bundle_iterator - * @var const - */ - const DATABASE_CONNECTION_DETAILS_FILE = 'database_connection_details.txt'; - - /** - * Creates a database archive iterator with the given parameters. - * @param string $iterate_timestamp timestamp of the arc archive bundle to - * iterate over the pages of - * @param string $result_timestamp timestamp of the arc archive bundle - * results are being stored in - */ - function __construct($iterate_timestamp, $iterate_dir, - $result_timestamp, $result_dir) - { - $this->iterate_timestamp = $iterate_timestamp; - $this->iterate_dir = $iterate_dir; - $this->result_timestamp = $result_timestamp; - $this->result_dir = $result_dir; - $this->index_dir = CRAWL_DIR."/cache/IndexData".$this->result_timestamp; - $temp_array = explode("/", $this->index_dir); - $temp_array_len = count($temp_array); - - for($i = 1;$i<$temp_array_len;$i++){ - if($temp_array[$i-1]=="htdocs"){ - while($temp_array[$i] != "cache"){ - $this->path_for_html_files .= $temp_array[$i]."/"; - $i++; - } - break; - } - } - if(file_exists("{$this->iterate_dir}/". - self::DATABASE_CONNECTION_DETAILS_FILE)) - { - $database_connection_details_info = unserialize - (file_get_contents( - "{$this->iterate_dir}/".self::DATABASE_CONNECTION_DETAILS_FILE)); - file_put_contents("{$this->index_dir}/database_connection_details". - $this->result_timestamp.".txt", - serialize($database_connection_details_info)); - @unlink("{$this->iterate_dir}/". - self::DATABASE_CONNECTION_DETAILS_FILE); - } - if(!is_dir("{$this->index_dir}/HTML_FILES")){ - mkdir("{$this->index_dir}/HTML_FILES"); - } - $this->createRecords(); - - if(file_exists("{$this->result_dir}/iterate_status.txt")) { - $this->restoreCheckpoint(); - } - else { - $this->reset(); - } - } - - /** - * Estimates the important of the site according to the weighting of - * the particular archive iterator - * @param $site an associative array containing info about a web page - * @return bool false we assume arc files were crawled according to - * OPIC and so we use the default doc_depth to estimate page importance - */ - function weight(&$site) - { - return false; - } - - /** - * Resets the iterator to the start of the archive bundle - */ - function reset() - { - $this->current_page_num = -1; - $this->end_of_iterator = false; - @unlink("{$this->result_dir}/iterate_status.txt"); - } - - /** - * Saves the current state so that a new instantiation can pick up just - * after the last batch of pages extracted. - */ - function saveCheckpoint($info = array()) - { - $info['end_of_iterator'] = $this->end_of_iterator; - $info['current_page_num'] = $this->current_page_num; - $info['database_iterator'] = $this->database_iterator; - file_put_contents("{$this->result_dir}/iterate_status.txt", - serialize($info)); - } - - /** - * Restores state from a previous instantiation, after the last batch of - * pages extracted. - */ - function restoreCheckpoint() - { - $info = unserialize(file_get_contents( - "{$this->result_dir}/iterate_status.txt")); - $this->end_of_iterator = $info['end_of_iterator']; - $this->current_page_num = $info['current_page_num']; - $this->database_iterator = $info['database_iterator']; - return $info; - } - - /** - * Creates Records array containing all the records to satisfying the query - */ - function createRecords() - { - $this->databaseConnectionsArray = unserialize(file_get_contents( - "{$this->index_dir}/database_connection_details". - $this->result_timestamp.".txt")); - $this->host = $this->databaseConnectionsArray['HOSTNAME']; - $this->user_name = $this->databaseConnectionsArray['USERNAME']; - $this->password = $this->databaseConnectionsArray['PASSWORD']; - $this->database = $this->databaseConnectionsArray['DATABASENAME']; - $this->query = $this->databaseConnectionsArray['QUERY']; - $this->db_handle = mysql_connect($this->host, $this->user_name, - $this->password); - $this->db_found = mysql_select_db($this->database, $this->db_handle); - - /*If database exists*/ - if ($this->db_found) { - $result1 = mysql_query($this->query); - $num_fields = mysql_num_fields($result1); - for($i = 0; $i < $num_fields; $i++) { - $this->field_names[$i] = mysql_field_name($result1, $i); - } - while ($row = mysql_fetch_row($result1)) { - $this->records[] = $row; - } - $this->num_of_records = count($this->records); - mysql_free_result($result1); - mysql_close($db_handle); - } - } - /** - * Gets the next at most $num many records from the iterator. It might - * return less than $num many documents if the end of the bundle is reached. - * @param int $num number of docs to get - * @return array associative arrays for $num pages - */ - function nextPages($num) - { - $pages = array(); - for($i = 0; $i < $num; $i++) { - $this->current_page_num++; - $page = $this->nextPage(); - if($this->current_page_num >= $this->num_of_records) { - $this->end_of_iterator = true; - break; - } - else { - $pages[] = $page; - } - } - - $this->saveCheckpoint(); - return $pages; - } - - - /** - * Gets the next record from the iterator - * @return array associative array for record - */ - function nextPage() - { - $site = array(); - $field_nd = ""; - $html_page = ""; - $temp_record = array(); - $temp_record = $this->records[$this->current_page_num]; - $dom = new DOMDocument('1.0'); - $root =$dom->createElement('html'); - $root = $dom->appendChild($root); - $head = $dom->createElement('head'); - $head = $root->appendChild($head); - $title = $dom->createElement('title'); - $title = $head->appendChild($title); - $recordTitle = "Database Record".$this->current_page_num; - $text = $dom->createTextNode($recordTitle); - $text = $title->appendChild($text); - $body = $dom->createElement('body'); - $body = $root->appendChild($body); - $field = $dom->createElement('p'); - $field = $body->appendChild($field); - $fieldnames_len = count($this->field_names); - for($i = 0; $i < $fieldnames_len; $i++) { - $field_nd .= $this->field_names[$i]." : ".$temp_record[$i] - ."<br>"; - } - $text1 = $dom->createTextNode($field_nd); - $text1 = $field->appendChild($text1); - $desc = $dom->createElement('p'); - $desc = $body->appendChild($desc); - $text3 = "This is database record ".$this->current_page_num; - $text2 = $dom->createTextNode($text3); - $text2 = $desc->appendChild($text2); - $site[self::PAGE] =$dom->saveHTML(); - $html_page = "<html><head><title>DatabaseRecord". - $this->current_page_num. - "</title></head><body><h1>The details of the database record are:". - "<br></h1><h3>".$field_nd."</h3></body></html>"; - file_put_contents("{$this->index_dir}/HTML_FILES/databaserecord". - $this->current_page_num.".php",$html_page); - $site[self::URL] = "http://localhost/".$this->path_for_html_files. - "cache/IndexData".$this->result_timestamp. - "/HTML_FILES/databaserecord".$this->current_page_num.".php"; - $site[self::TYPE] ="text/html"; - $site[self::HTTP_CODE] = 200; - $site[self::ENCODING] = "UTF-8"; - $site[self::SERVER] = "unknown"; - $site[self::SERVER_VERSION] = "unknown"; - $site[self::OPERATING_SYSTEM] = "unknown"; - $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]); - $site[self::WEIGHT] = 1; - - return $site; - - } -} -?> \ No newline at end of file diff --git a/lib/archive_bundle_iterators/log_archive_bundle_iterator.php b/lib/archive_bundle_iterators/log_archive_bundle_iterator.php deleted file mode 100644 index b5645b334..000000000 --- a/lib/archive_bundle_iterators/log_archive_bundle_iterator.php +++ /dev/null @@ -1,452 +0,0 @@ -<?php -/** - * SeekQuarry/Yioop -- - * Open Source Pure PHP Search Engine, Crawler, and Indexer - * - * Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org - * - * LICENSE: - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * END LICENSE - * - * @author Tanmayee Potluri - * @package seek_quarry - * @subpackage iterator - * @license http://www.gnu.org/licenses/ GPL3 - * @link http://www.seekquarry.com/ - * @copyright 2009 - 2013 - * @filesource - */ - -if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} - -/** - *Loads base class for iterating - */ -require_once BASE_DIR. - '/lib/archive_bundle_iterators/archive_bundle_iterator.php'; - -/** - * Used to iterate through the collection of log files stored in - * a WebArchiveBundle folder. Log is the file format which has the - * activities the system or a server performs. Iteration would be - * for the purpose making an index of these files. - * - * @author Tanmayee Potluri - * @package seek_quarry - * @subpackage iterator - * @see WebArchiveBundle - */ - -class LogArchiveBundleIterator extends ArchiveBundleIterator - implements CrawlConstants -{ - /** - * The path to the directory containing the archive partitions to be - * iterated over. - * @var string - */ - var $iterate_dir; - - /** - * The path to the directory where the iteration status is stored. - * @var string - */ - var $result_dir; - - /** - * The path to the directory where all html files are stored - * and used to point to. - * @var string - */ - var $index_dir; - - /** - * The part of the path to the directory where all html files are stored - * and used to point to. - * @var string - */ - var $path_for_html_files; - - /** - * The number of log files in this log archive bundle - * @var int - */ - var $num_partitions; - - /** - * current record number in the master log file - * @var int - */ - var $current_page_num; - - /** - * number of records in the master log file - * @var int - */ - var $num_of_records; - - /** - * Array of log records in the master log file in the directory - * @var array - */ - var $records; - - /** - * Array of filenames of log files in this directory (glob order) - * @var array - */ - var $partitions; - - /** - * Array of fields of log file specified by the user - * @var array - */ - var $fields; - - /** - * Array of fieldnames specified by the user - * @var array - */ - var $field_names; - - /** - * Array of fieldtypes specified by the user - * @var array - */ - var $field_types; - - /** - * Array of fields in each record separately - * @var array - */ - var $page_info; - - /** - * Array of regular expressions for all the data types - * @var array - */ - var $regular_exprs; - - /** - * Array of log fields type in drop down box in the UI - * @var array - */ - var $logfields_type_ddm = array( - 1=>'IP_Address', - 2=>'Timestamp', - 3=>'URL', - 4=>'Status Code', - 5=>'User Agent', - 6=>'Request', - 7=>'Int'); - - /** - * Constants required for log archive_bundle_iterator - * @var const - */ - const FIELDS_DATA_FILE = 'fields_data.txt'; - const MASTER_LOG_FILE = 'master.log'; - - - /** - * Creates a log archive iterator with the given parameters. - * - * @param string $iterate_timestamp timestamp of the log archive bundle to - * iterate over the pages of - * @param string $result_timestamp timestamp of the log archive bundle - * results are being stored in - */ - function __construct($iterate_timestamp, $iterate_dir, - $result_timestamp, $result_dir) - { - $this->regular_exprs = array( - 'IP_Address' => '/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/', - 'Timestamp' => '/\[[^:]+:\d+:\d+:\d+ [^\]]+\]/', - 'Request' => '/(GET|HEAD|POST|PUT|DELETE|TRACE|OPTIONS|CONNECT)+[^"]*/', - 'Status Code'=> '/\s[1-5]\d{2}\s/', - 'Int' => '/\s[0-9]+\s/', - 'User Agent' => '/"([a-zA-Z0-9][^"]+)"/'); - $this->regular_exprs['URL'] = '/(http|https|ftp):\/\/[A-Za-z0-9]'. - '[A-Za-z0-9_-]*[\/]*(?:.[A-Za-z0-9][A-Za-z0-9_-]*'. - '[\/]*)+:?(d*)[\/]*/'; - $this->path_for_html_files = ""; - $this->iterate_timestamp = $iterate_timestamp; - $this->iterate_dir = $iterate_dir; - $this->result_timestamp = $result_timestamp; - $this->result_dir = $result_dir; - $this->index_dir = CRAWL_DIR."/cache/IndexData".$this->result_timestamp; - $temp_array = explode("/", $this->index_dir); - $temp_array_len = count($temp_array); - - for($i = 1; $i < $temp_array_len; $i++) { - if($temp_array[$i-1] == "htdocs") { - while($temp_array[$i] != "cache") { - $this->path_for_html_files .= $temp_array[$i]."/"; - $i++; - } - break; - } - } - if(file_exists("{$this->iterate_dir}/".self::FIELDS_DATA_FILE)) { - $fields_data = unserialize(file_get_contents( - "{$this->iterate_dir}/".self::FIELDS_DATA_FILE)); - file_put_contents( - "{$this->index_dir}/fields_data".$this->result_timestamp.".txt", - serialize($fields_data)); - @unlink("{$this->iterate_dir}/".self::FIELDS_DATA_FILE); - } - if(!is_dir("{$this->index_dir}/HTML_FILES")) { - mkdir("{$this->index_dir}/HTML_FILES"); - } - $this->partitions = array(); - foreach(glob("{$this->iterate_dir}/*.log") as $filename) { - if(strpos($filename,self::MASTER_LOG_FILE)!= true) { - $this->partitions[] = $filename; - } - } - $this->num_partitions = count($this->partitions); - $this->records = $this->createMasterAndRecords(); - $this->num_of_records = count($this->records); - if(file_exists("{$this->result_dir}/iterate_status.txt")){ - $this->restoreCheckpoint(); - } - else { - $this->reset(); - } - } - - /** - * Estimates the important of the site according to the weighting of - * the particular archive iterator - * @param $site an associative array containing info about a web page - * @return value 1 we assume all log files crawled have the same - * page importance - */ - function weight(&$site) - { - return 1; - } - - /** - * Resets the iterator to the start of the archive bundle - */ - function reset() - { - $this->current_page_num = -1; - $this->end_of_iterator = false; - @unlink("{$this->result_dir}/iterate_status.txt"); - } - - /** - * Saves the current state so that a new instantiation can pick up just - * after the last batch of pages extracted. - */ - function saveCheckpoint($info = array()) - { - $info['end_of_iterator'] = $this->end_of_iterator; - $info['current_page_num'] = $this->current_page_num; - $info['log_iterator'] = $this->log_iterator; - file_put_contents("{$this->result_dir}/iterate_status.txt", - serialize($info)); - if($this->end_of_iterator == true){ - @unlink("{$this->iterate_dir}/".self::MASTER_LOG_FILE); - } - } - - /** - * Restores state from a previous instantiation, after the last batch of - * pages extracted. - */ - function restoreCheckpoint() - { - $info = unserialize(file_get_contents( - "{$this->result_dir}/iterate_status.txt")); - $this->end_of_iterator = $info['end_of_iterator']; - $this->current_page_num = $info['current_page_num']; - $this->log_iterator = $info['log_iterator']; - return $info; - } - - /** - * Pulls data from all the log files in the directory and place it in - * the master log file and splits the file into log records - * @return array of log records from the master log file - */ - function createMasterAndRecords(){ - if(file_exists("{$this->iterate_dir}/".self::MASTER_LOG_FILE)) { - @unlink("{$this->iterate_dir}/".self::MASTER_LOG_FILE); - } - for ($i=0; $i < $this->num_partitions; $i++){ - $file_data = file_get_contents($this->partitions[$i]); - file_put_contents("{$this->iterate_dir}/".self::MASTER_LOG_FILE, - $file_data, FILE_APPEND); - } - $recordArray = explode("\n", - file_get_contents("{$this->iterate_dir}/". - self::MASTER_LOG_FILE)); - - return $recordArray; - } - - /** - * Unserializes the array of log records stored when save - * options is clicked and stores them in the global arrays. - */ - function getFieldDetails() - { - $fieldArray = unserialize(file_get_contents( - "{$this->index_dir}/fields_data".$this->result_timestamp.".txt")); - foreach($fieldArray as $field=>$field_nt){ - $matches = explode("::",$field_nt); - $this->fields[] = $field; - $this->field_names[] = $matches[0]; - $this->field_types[] = $matches[1]; - } - } - - /** - * Takes the log record as input and parses the log record and returns - * the array containing the details of each record. - * - * @param string $record single log record in a file - * @return array $matches matched fields of a log record - */ - function parseLogRecord($record) - { - $content = ""; - $field_types_len = count($this->field_types); - for($j = 0; $j < $field_types_len; $j++) { - $content = ""; - $matches = array(); - preg_match( - $this->regular_exprs[$this->logfields_type_ddm[$this->field_types[$j]]], - $record,$matches); - if (count($matches)>0) { - $spaces_removed = trim($matches[0]); - $record = str_replace($spaces_removed,"",$record); - $return_page[$this->logfields_type_ddm[$this->field_types[$j]]] - = $matches[0]; - } - else { - $return_page[$this->logfields_type_ddm[$this->field_types[$j]]] - = "-"; - } - } - return $return_page; - } - - /** - * Gets the next at most $num many docs from the iterator. It might return - * less than $num many documents if the partition changes or the end of the - * bundle is reached. - * - * @param int $num number of docs to get - * @return array associative arrays for $num pages - */ - function nextPages($num) - { - $this->getFieldDetails(); - $pages = array(); - $page_count = 0; - for($i = 0; $i < $num; $i++) { - $this->current_page_num++; - $this->page_info - = $this->parseLogRecord($this->records[$this->current_page_num]); - $page = $this->nextPage(); - if($this->current_page_num >= $this->num_of_records) { - $this->end_of_iterator = true; - break; - } - else { - $pages[] = $page; - $page_count++; - } - } - $this->saveCheckpoint(); - return $pages; - } - - - /** - * Gets the next doc from the iterator - * @return array associative array for doc - */ - function nextPage() - { - $site = array(); - $field_nt = ""; - $fields_count = count($this->page_info); - $dom = new DOMDocument('1.0'); - $root =$dom->createElement('html'); - $root = $dom->appendChild($root); - $head = $dom->createElement('head'); - $head = $root->appendChild($head); - $title = $dom->createElement('title'); - $title = $head->appendChild($title); - for($i=0;$i<$fields_count;$i++){ - if($this->logfields_type_ddm[$this->field_types[$i]] =="Request" - && $this->page_info['Request'] !="-") { - $recordTitle = "Line ".$this->current_page_num.":". - $this->page_info[$this->logfields_type_ddm[$this->field_types[$i]]]; - break; - } - } - if($recordTitle == ""){ - $recordTitle = "Line ".$this->current_page_num; - } - $text = $dom->createTextNode($recordTitle); - $text = $title->appendChild($text); - $body = $dom->createElement('body'); - $body = $root->appendChild($body); - $field = $dom->createElement('p'); - $field = $body->appendChild($field); - for($i=0;$i<$fields_count;$i++){ - $field_nt .= $this->field_names[$i].":". - $this->page_info[$this->logfields_type_ddm[$this->field_types[$i]]] - ."<br/>"; - } - $text1 = $dom->createTextNode($field_nt); - $text1 = $field->appendChild($text1); - $desc = $dom->createElement('p'); - $desc = $body->appendChild($desc); - $text3 = "This is line ".$this->current_page_num; - $text2 = $dom->createTextNode($text3); - $text2 = $desc->appendChild($text2); - $site[self::PAGE] =$dom->saveHTML(); - $html_page = "<html><head><title>LogRecord".$this->current_page_num. - "</title></head><body><h1>". - "The details of the log record are: <br/></h1><h3>". - $field_nt."</h3></body></html>"; - file_put_contents("{$this->index_dir}/HTML_FILES/logrecord". - $this->current_page_num.".php",$html_page); - $site[self::URL] = "http://localhost/".$this->path_for_html_files. - "cache/IndexData".$this->result_timestamp. - "/HTML_FILES/logrecord" - .$this->current_page_num.".php"; - $site[self::TYPE] ="text/html"; - $site[self::HTTP_CODE] = 200; - $site[self::ENCODING] = "UTF-8"; - $site[self::SERVER] = "unknown"; - $site[self::SERVER_VERSION] = "unknown"; - $site[self::OPERATING_SYSTEM] = "unknown"; - $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]); - $site[self::WEIGHT] = 1; - - return $site; - } -} -?> diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php index 801d960af..6ad21a61d 100644 --- a/lib/crawl_constants.php +++ b/lib/crawl_constants.php @@ -210,11 +210,8 @@ interface CrawlConstants const LINK_SEEN_URLS = 'cj'; const POST_MAX_SIZE = 'ck'; const LOGGING = 'cl'; - const LOG_RECORDS = 'cm'; - const DATABASE_RECORDS = 'cn'; - const DATABASE_CONNECTION_DETAILS = 'co'; const NEEDS_OFFSET_FLAG = 0x7FFFFFFF; } -?> \ No newline at end of file +?> diff --git a/scripts/suggest.js b/scripts/suggest.js index 564e50f96..fc693626d 100644 --- a/scripts/suggest.js +++ b/scripts/suggest.js @@ -104,7 +104,10 @@ function onTypeTerm(event, text_field) search_list_array = new Object(); scroll_horz = false; - out_query = transliterate(query); + out_query = false; + if(typeof transliterate == 'function') { + out_query = transliterate(query); + } if(out_query && out_query.length > 0) { input_term = out_query; diff --git a/views/elements/crawloptions_element.php b/views/elements/crawloptions_element.php index 26383333d..910aca795 100644 --- a/views/elements/crawloptions_element.php +++ b/views/elements/crawloptions_element.php @@ -159,287 +159,6 @@ class CrawloptionsElement extends Element <div class="center red"><?php e(tl('crawloptions_element_need_api_for_mix')); ?></div> <?php } ?> - - <script> - obj = document.getElementById("crawl-indexes"); - obj.onchange = function(){crawloptionsForm.submit();} - </script> - - <?php $data['logfields_type'] = array( - 1=>'IP_Address', - 2=>'Timestamp', - 3=>'URL', - 4=>'Status Code', - 5=>'User Agent', - 6=>'Request', - 7=>'Int'); - - $flag = false; - - /* If log files are selected as the option */ - if(isset($_POST['crawl_indexes']) - && $data['available_crawl_indexes'][$_POST['crawl_indexes']] - == 'ARCFILE::Log Files') { - $LogFolderPath = CRAWL_DIR.'/cache/archives'; - foreach(glob($LogFolderPath."/*") as $folder){ - if(is_dir($folder)){ - if(file_exists("$folder/arc_description.ini")){ - $contents = - file_get_contents("$folder/arc_description.ini"); - if(strpos($contents,"LogArchiveBundle") - == true){ - $flag = true; - $LogFolderPath = $folder; - } - } - } - if($flag == true) {break;} - } - - /*Get all the file names into an array*/ - $filenames = glob($LogFolderPath."/*.log"); - /*Retrieve the first filename*/ - $firstFile = $filenames[0]; - /*Split the file content into an array*/ - $l_delim = "\n"; - $file_array = explode($l_delim, file_get_contents($firstFile)); - echo "<br/><b>".tl('crawloptions_element_first_line_text')."</b><br/>"; - echo "<br/>".$file_array[0]."<br/>"; - - ?> - - <div id="Log_Records" class="top-margin"><b><?php - e(tl('crawloptions_element_log_records_details'))?></b></div> - - <table class="log-records-table"> - <tr><th><?php e(tl('crawloptions_element_field'));?></th> - <th><?php e(tl('crawloptions_element_field_name')); ?></th> - <th><?php e(tl('crawloptions_element_field_type')); ?></th></tr> - <?php - $i = 0; - foreach($data['LOG_RECORDS'] as $field => $field_nt) { - $matches = explode("::",$field_nt); - ?> - <tr><td class="input-field" > - <input - title="<?php e(tl('crawloptions_element_field')); ?>" - name="LOG_RECORDS[<?php e($i); ?>][FIELD]" - value="<?php e($field); ?>" - /> - </td> - <td class="input-field-name"> - <input - title="<?php e(tl('crawloptions_element_field_name')); ?>" - name="LOG_RECORDS[<?php e($i); ?>]['FIELD_NAME']" - value="<?php e($matches[0]); ?>" - /> - </td> - <td class="input-field-type" > - <?php $this->view->optionsHelper->render( - 'field-types', - "LOG_RECORDS[$i]['FIELD_TYPE']", - $data['logfields_type'], - $matches[1]); - ?> - </td> - </tr> - <?php - $i++; - } - if($i==0){ - ?> - <tr><td class="input-field"> - <input - type="text" - title="New Field" - name="LOG_RECORDS[<?php e($i); ?>][FIELD]" - value="" - /> - </td> - <td class="input-field-name"> - <input - type="text" - title="New Field Name" - name="LOG_RECORDS[<?php e($i); ?>]['FIELD_NAME']" - value="" - /> - </td> - <td class="input-field-type"> - <?php $this->view->optionsHelper->render( - 'field-types', - "LOG_RECORDS[$i]['FIELD_TYPE']", - $data['logfields_type'], - 1); - ?> - </td> - </tr> - - <?php } ?> - - <?php - if(isset($_POST['add_fields']) && $i>0){ - ?> - <tr> - <td class="input-field"> - <input - type="text" - title="New Field" - name="LOG_RECORDS[<?php e($i); ?>]['FIELD']" - value="" - /> - </td> - <td class="input-field-name"> - <input - type="text" - title="New Field Name" - name="LOG_RECORDS[<?php e($i); ?>]['FIELD_NAME']" - value="" - /> - </td> - <td class="input-field-type"> - <?php $this->view->optionsHelper->render( - 'field-types', - "LOG_RECORDS[$i]['FIELD_TYPE']", - $data['logfields_type'], - 1); - ?> - </td> - </tr> - - <?php } ?> - </table> - <?php - if(isset($_POST['save_options']) - && $data['available_crawl_indexes'][$_POST['crawl_indexes']] - == 'ARCFILE::Log Files'){ - file_put_contents($LogFolderPath."/fields_data.txt", - serialize($data['LOG_RECORDS'])); - } - ?> - <div class="log-record-new-field"> - <input - type="submit" - id="add-fields" - name="add_fields" - value="<?php e(tl('crawloptions_element_add_new_field')); ?>" - /> - </div> - <?php } ?> - - <?php - if(isset($_POST['crawl_indexes']) && - $data['available_crawl_indexes'][$_POST['crawl_indexes']] - == 'ARCFILE::Database files') { - $flag1 = false; - $DatabaseFolderPath = CRAWL_DIR.'/cache/archives'; - foreach(glob($DatabaseFolderPath."/*") as $folder){ - if(is_dir($folder)){ - if(file_exists("$folder/arc_description.ini")){ - $contents = - file_get_contents("$folder/arc_description.ini"); - if(strpos($contents,"Database files") == true){ - $flag1 = true; - $DatabaseFolderPath = $folder; - } - } - } - if($flag1 == true) {break;} - } - ?> - - <div id="Database_Connection_Details" class="top-margin"><b><?php - e(tl('crawloptions_element_database_connection_details'))?></b> - </div><br/> - - <table class="database-connection-details-table"> - <tr><td class="input-name"><?php - e(tl('crawloptions_element_hostname'))?> - </td> - <td class="input-data"> - <input - title="<?php e(tl('crawloptions_element_hostname')); ?>" - name="DATABASE_CONNECTION_DETAILS[HOSTNAME]" - value="" - /> - </td> - </tr> - <tr> - <td class="input-name"><?php - e(tl('crawloptions_element_username'))?> - </td> - <td class="input-data"> - <input - title="<?php e(tl('crawloptions_element_username')); ?>" - name="DATABASE_CONNECTION_DETAILS[USERNAME]" - value="" - /> - </td> - </tr> - <tr> - <td class="input-name"><?php - e(tl('crawloptions_element_password'))?> - </td> - <td class="input-data"> - <input - title="<?php e(tl('crawloptions_element_password')); ?>" - name="DATABASE_CONNECTION_DETAILS[PASSWORD]" - value="" - /> - </td> - </tr> - <tr> - <td class="input-name"><?php - e(tl('crawloptions_element_databasename'))?> - </td> - <td class="input-data"> - <input - title= "<?php e(tl('crawloptions_element_databasename')); ?>" - name="DATABASE_CONNECTION_DETAILS[DATABASENAME]" - value="" - /> - </td> - </tr> - <tr> - <td class="input-name"><?php - e(tl('crawloptions_element_query'))?> - </td> - <td class="input-data"> - <input - title="<?php e(tl('crawloptions_element_query')); ?>" - name="DATABASE_CONNECTION_DETAILS[QUERY]" - value="" - /> - </td> - </tr> - </table> - <div class="database-connection-details-submit"> - <input type="submit" - id="submit-details" - name="submit_details" - value="<?php e(tl('crawloptions_element_submit')); ?>" - /> - </div> - - <?php - if(isset($_POST['submit_details'])){ - file_put_contents($DatabaseFolderPath. - "/database_connection_details.txt", - serialize($data['DATABASE_CONNECTION_DETAILS'])); - } - ?> - <?php - if(isset($_POST['save_options']) - && $data['available_crawl_indexes'][$_POST['crawl_indexes']] - == 'ARCFILE::Database files'){ - file_put_contents($DatabaseFolderPath. - "/database_connection_details.txt", - serialize($data['DATABASE_CONNECTION_DETAILS'])); - } - ?> - - <?php } ?> - - </div> <?php } ?> </div>