diff --git a/bin/arc_tool.php b/bin/arc_tool.php index 6a7cbaab8..30ac97392 100755 --- a/bin/arc_tool.php +++ b/bin/arc_tool.php @@ -149,7 +149,7 @@ class ArcTool implements CrawlConstants if($path == $argv[2] && !file_exists($path)) { $path = CRAWL_DIR."/cache/".$path; if(!file_exists($path)) { - $path = CRAWL_DIR."/cache/archives/".$argv[2]; + $path = CRAWL_DIR."/archives/".$argv[2]; } } } @@ -223,7 +223,7 @@ class ArcTool implements CrawlConstants } } - $nonyioop_pattern = CRAWL_DIR."/cache/archives/*/arc_description.ini"; + $nonyioop_pattern = CRAWL_DIR."/archives/*/arc_description.ini"; $archives = glob($nonyioop_pattern); if(is_array($archives) && count($archives) > 0 ) { $archives_found = true; @@ -481,7 +481,7 @@ class ArcTool implements CrawlConstants */ function getArchiveName($archive_path) { - $start = CRAWL_DIR."/cache/archives/"; + $start = CRAWL_DIR."/archives/"; if(strstr($archive_path, $start)) { $start_len = strlen($start); $name = substr($archive_path, $start_len); @@ -846,7 +846,7 @@ php arc_tool.php info bundle_name php arc_tool.php list /* returns a list of all the archives in the Yioop! crawl directory, - including non-Yioop! archives in the cache/archives sub-folder.*/ + including non-Yioop! 
archives in the /archives sub-folder.*/ php arc_tool.php mergetiers bundle_name max_tier // merges tiers of word dictionary into one tier up to max_tier diff --git a/bin/classifier_tool.php b/bin/classifier_tool.php new file mode 100755 index 000000000..73d1dbe92 --- /dev/null +++ b/bin/classifier_tool.php @@ -0,0 +1,739 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage bin + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009 - 2013 + * @filesource + */ + +if(php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();} + +/** + * Calculate base directory of script + * @ignore + */ +define("BASE_DIR", substr( + dirname(realpath($_SERVER['PHP_SELF'])), 0, + -strlen("/bin"))); + +/** Load in global configuration settings */ +require_once BASE_DIR.'/configs/config.php'; +if(!PROFILE) { + echo "Please configure the search engine instance by visiting" . 
+        " its web interface on localhost.\n";
+    exit();
+}
+
+/**
+ * CRAWLING means don't try to use memcache
+ * @ignore
+ */
+define("NO_CACHE", true);
+
+/**
+ * Immediately throw an exception for all notices and warnings, rather than
+ * letting execution continue. Errors of any other level are echoed and
+ * execution carries on.
+ *
+ * @param int $errno level of the error that was raised (compared against
+ *      E_NOTICE and E_WARNING)
+ * @param string $err_str the error message
+ * @param string $err_file name of the file in which the error was raised
+ * @param int $err_line line number at which the error was raised
+ * @ignore
+ */
+function handleError($errno, $err_str, $err_file, $err_line)
+{
+    if (error_reporting() == 0) {
+        // Error suppressed by @, so ignore.
+        return;
+    }
+    $msg = "$err_str in $err_file on line $err_line";
+    if ($errno == E_NOTICE || $errno == E_WARNING) {
+        throw new ErrorException($msg, $errno);
+    } else {
+        echo $msg;
+    }
+}
+set_error_handler('handleError');
+
+/** To use and manipulate classifiers */
+require_once BASE_DIR."/lib/classifiers/classifier.php";
+/** To manipulate crawl mixes using the controller's methods */
+require_once BASE_DIR."/controllers/classifier_controller.php";
+
+/*
+ * We'll set up multi-byte string handling to use UTF-8
+ */
+mb_internal_encoding("UTF-8");
+mb_regex_encoding("UTF-8");
+
+/**
+ * This class is used to automate the building and testing of classifiers,
+ * providing an alternative to the web interface when a labeled training set is
+ * available.
+ *
+ * ClassifierTool takes an activity to perform, the name of a dataset to use,
+ * and a label for the constructed classifier. The activity is the name of one
+ * of the 'run*' functions implemented by this class, without the common 'run'
+ * prefix (e.g., 'TrainAndTest'). The dataset is specified as the common prefix
+ * of two indexes that have the suffixes "Pos" and "Neg", respectively. So if
+ * the prefix were "DATASET", then this tool would look for the two existing
+ * indexes "DATASET Pos" and "DATASET Neg" from which to draw positive and
+ * negative examples. Each document in these indexes should be a positive or
+ * negative example of the target class, according to whether it's in the "Pos"
+ * or "Neg" index.
Finally, the label is just the label to be used for the
+ * constructed classifier.
+ *
+ * Beyond these options (set with the -a, -d, and -l flags), a number of other
+ * options may be set to alter parameters used by an activity or a classifier.
+ * These options are set using the -S, -I, -F, and -B flags, which correspond
+ * to string, integer, float, and boolean parameters respectively. These flags
+ * may be used repeatedly, and each expects an argument of the form NAME=VALUE,
+ * where NAME is the name of a parameter, and VALUE is a value parsed according
+ * to the flag. The NAME should match one of the keys of the options member of
+ * this class, where a period ('.') may be used to specify nesting. For
+ * example:
+ *
+ *    -I debug=1         # set the debug level to 1
+ *    -B cls.use_nb=1    # tell the classifier to use Naive Bayes
+ *
+ * To build and evaluate a classifier for the label 'spam', trained using the
+ * two indexes "DATASET Neg" and "DATASET Pos", and a maximum of the top 25
+ * most informative features:
+ *
+ *    php bin/classifier_tool.php -a TrainAndTest -d 'DATASET' -l 'spam'
+ *        -I cls.chi2.max=25
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ */
+class ClassifierTool
+{
+    /**
+     * Reference to a classifier controller, used to manipulate crawl mixes in
+     * the same way that the controller that handles web requests does.
+     * @var object
+     */
+    var $classifierController;
+
+    /**
+     * Reference to a crawl model object, also used to manipulate crawl mixes.
+     * It is taken from the classifier controller in the constructor.
+     * @var object
+     */
+    var $crawlModel;
+
+    /**
+     * Options to be used by activities and constructed classifiers. These
+     * options can be overridden by supplying an appropriate flag on the
+     * command line, where nesting is denoted by a period (e.g., cls.chi2.max).
+     * The supported options are:
+     *
+     *  debug: An integer, the level of debug statements to print. Larger
+     *      integers specify more detailed debug output; the default value of
+     *      0 indicates no debug output.
+     *
+     *  max_train: An integer, the maximum number of examples to use when
+     *      training a classifier. The default value of NULL indicates that
+     *      all available training examples should be used.
+     *      NOTE(review): both run* activities in this file call
+     *      setDefault('max_train', 200) before training, so when the option
+     *      is left unset they cap training at 200 examples.
+     *
+     *  test_interval: An integer, the number of new training examples to be
+     *      added before a round of testing on ALL test instances is to be
+     *      executed. With an interval of 5, for example, after adding five
+     *      new training examples, the classifier would be finalized and used
+     *      to classify all test instances. The error is reported for each
+     *      round of testing. The default value of NULL indicates that
+     *      testing should only occur after all training examples have been
+     *      added. A string of comma-separated offsets (set with -S, e.g.
+     *      "10,25,50,100") is also accepted; see isTestPoint.
+     *
+     *  split: An integer, the number of examples from the entire set of
+     *      labeled examples to use for training. The remainder are used for
+     *      testing. The default is 3000.
+     *
+     *  cls.use_nb: A boolean, whether or not to use the Naive Bayes
+     *      classification algorithm instead of the logistic regression one
+     *      in order to finalize the classifier. The default value is false,
+     *      indicating that logistic regression should be used.
+     *
+     *  cls.chi2.max: An integer, the maximum number of features to use when
+     *      training the classifier. The default is a relatively
+     *      conservative 200.
+     *
+     * @var array
+     */
+    var $options = array(
+        'debug' => 0,
+        'max_train' => NULL,
+        'test_interval' => NULL,
+        'split' => 3000,
+        'cls' => array(
+            'use_nb' => false,
+            'chi2' => array(
+                'max' => 200)));
+
+    /**
+     * Initializes the classifier controller and crawl model that will be used
+     * to manage crawl mixes, used for iterating over labeled examples. The
+     * crawl model is the one already held by the new controller, so both
+     * members work against the same model instance.
+     */
+    function __construct()
+    {
+        $this->classifierController = new ClassifierController();
+        $this->crawlModel = $this->classifierController->crawlModel;
+    }
+
+    /**
+     * Parses the command-line options, returns the required arguments, and
+     * updates the member variable $options with any parameters.
If any of the
+     * required arguments (activity, dataset, or label) are missing, then a
+     * message is printed and the program exits with status 1. Every other
+     * flag (-S, -I, -F, -B) is forwarded to the setOptions method together
+     * with the name of the function used to cast its value, which directly
+     * modifies the class state.
+     *
+     * @return array the parsed activity, dataset, and label, in that order
+     */
+    function parseOptions()
+    {
+        $parsed = getopt('l:a:d:S:I:F:B:');
+        // Checked in this order, so the first missing flag is the one named.
+        $required = array(
+            'a' => "missing -a flag to choose activity to run\n",
+            'l' => "missing -l flag to set classifier label\n",
+            'd' => "missing -d flag to choose dataset to use\n");
+        foreach ($required as $flag => $complaint) {
+            if (!isset($parsed[$flag])) {
+                echo $complaint;
+                exit(1);
+            }
+        }
+        $activity = $parsed['a'];
+        $label = Classifier::cleanLabel($parsed['l']);
+        $dataset_name = $parsed['d'];
+        unset($parsed['a'], $parsed['l'], $parsed['d']);
+        // Flag letter => converter name; NULL keeps the value as a string.
+        $converters = array(
+            'S' => NULL,
+            'I' => 'intval',
+            'F' => 'floatval',
+            'B' => 'boolval');
+        foreach ($parsed as $flag => $argument) {
+            if (array_key_exists($flag, $converters)) {
+                $this->setOptions($argument, $converters[$flag]);
+            } else {
+                echo "unsupported option: {$flag}\n";
+            }
+        }
+        return array($activity, $dataset_name, $label);
+    }
+
+    /**
+     * Entry point of the tool: parses the options and dispatches to the
+     * method named 'run' followed by the requested activity, passing in the
+     * label and dataset to be used. If no such method exists, an error is
+     * printed and the program exits with status 1.
+     */
+    function main()
+    {
+        list($activity, $dataset_name, $label) = $this->parseOptions();
+        $method = "run{$activity}";
+        if (!method_exists($this, $method)) {
+            echo "no activity: {$activity}\n";
+            exit(1);
+        }
+        $this->$method($label, $dataset_name);
+    }
+
+    /* ACTIVITIES */
+
+    /**
+     * Trains a classifier on a data set, testing at the specified intervals.
+     * The testing interval is set by the test_interval parameter.
Each time
+     * this activity is run a new classifier is created (replacing an old one
+     * with the same label, if necessary), and the classifier remains at the
+     * end.
+     *
+     * @param string $label class label of the new classifier
+     * @param string $dataset_name name of the dataset to train and test on
+     */
+    function runTrainAndTest($label, $dataset_name)
+    {
+        // Only takes effect when max_train was not set on the command line.
+        $this->setDefault('max_train', 200);
+        $this->logOptions();
+        $classifier = $this->makeFreshClassifier($label);
+        $data = $this->loadDataset($dataset_name, $label);
+        $classifier->initBuffer($data['train'], 0);
+        $pages = $data['train'];
+        $classifier->prepareToLabel();
+        // Never try to add more examples than the training iterator holds.
+        $end = min($this->options['max_train'], $pages->length);
+        for ($i = 1; $i <= $end; $i++) {
+            $page = $pages->nextPage();
+            // TRUE_LABEL is attached to every page by loadDataset.
+            $doc_label = $page['TRUE_LABEL'];
+            $key = Classifier::makeKey($page);
+            $classifier->addBufferDoc($page, false);
+            $classifier->labelDocument($key, $doc_label, false);
+            if ($this->isTestPoint($i, $end)) {
+                Classifier::setClassifier($classifier);
+                $this->testClassifier($classifier, $data);
+                /*
+                   Testing the classifier puts it into "classify" mode, which
+                   will use a different set of data from "label" mode, so it's
+                   important to switch back.
+                 */
+                $classifier->prepareToLabel();
+            }
+        }
+    }
+
+    /**
+     * Like the TrainAndTest activity, but uses active training in order to
+     * choose the documents to add to the training set. The method simulates
+     * the process that an actual user would go through in order to label
+     * documents for addition to the training set, then tests performance at
+     * the specified intervals.
+     *
+     * @param string $label class label of the new classifier
+     * @param string $dataset_name name of the dataset to train and test on
+     */
+    function runActiveTrainAndTest($label, $dataset_name)
+    {
+        // Only takes effect when max_train was not set on the command line.
+        $this->setDefault('max_train', 200);
+        $this->logOptions();
+        $classifier = $this->makeFreshClassifier($label);
+        $data = $this->loadDataset($dataset_name, $label);
+        $pages = $data['train'];
+        $classifier->prepareToLabel();
+        $classifier->initBuffer($pages);
+        $end = min($this->options['max_train'], $pages->length);
+        for ($i = 1; $i <= $end; $i++) {
+            // $disagreement is returned alongside the document but is not
+            // used by this activity.
+            list($new_doc, $disagreement) =
+                $classifier->findNextDocumentToLabel();
+            /*
+               The round still counts towards $i (and may still trigger a
+               test) when no document is returned.
+               NOTE(review): presumably that happens once the buffer has no
+               candidates left -- confirm against the Classifier class.
+             */
+            if ($new_doc) {
+                $key = Classifier::makeKey($new_doc);
+                // The known label stands in for the human labeler.
+                $doc_label = $new_doc['TRUE_LABEL'];
+                $classifier->labelDocument($key, $doc_label);
+                $classifier->refreshBuffer($pages);
+                $classifier->computeBufferDensities();
+                $classifier->train();
+            }
+            if ($this->isTestPoint($i, $end)) {
+                Classifier::setClassifier($classifier);
+                $this->testClassifier($classifier, $data);
+                // testClassifier leaves the classifier in `classify' mode.
+                $classifier->prepareToLabel();
+            }
+        }
+    }
+
+    /* UTILITY METHODS */
+
+    /**
+     * Creates a new classifier for a label, first deleting any existing
+     * classifier with the same label. The new classifier is constructed with
+     * the `cls' sub-array of the options and stored through
+     * Classifier::setClassifier before being returned.
+     *
+     * @param string $label class label of the new classifier
+     * @return object created classifier instance
+     */
+    function makeFreshClassifier($label)
+    {
+        // The looked-up instance only serves as an existence check; it is
+        // replaced by the freshly constructed classifier below.
+        if ($classifier = Classifier::getClassifier($label)) {
+            $this->deleteClassifier($label);
+        }
+        $classifier = new Classifier($label, $this->options['cls']);
+        Classifier::setClassifier($classifier);
+        return $classifier;
+    }
+
+    /**
+     * Deletes an existing classifier, specified by its label.
+     * The crawl mix named after the label is removed as well, together with
+     * its iterator state, when such a mix exists.
+     *
+     * @param string $label class label of the existing classifier
+     */
+    function deleteClassifier($label)
+    {
+        Classifier::deleteClassifier($label);
+        $mix_name = Classifier::getCrawlMixName($label);
+        $mix_time = $this->crawlModel->getCrawlMixTimestamp($mix_name);
+        if ($mix_time) {
+            $this->crawlModel->deleteCrawlMixIteratorState($mix_time);
+            $this->crawlModel->deleteCrawlMix($mix_time);
+        }
+    }
+
+    /**
+     * Fetches the summaries for pages in the indices specified by the passed
+     * dataset name. This method looks for existing indexes whose description
+     * is "RECRAWL::" followed by the dataset name prefix, a space, and the
+     * suffix "pos" or "neg" (ignoring case). The pages in these indexes are
+     * shuffled into one large array, and augmented with a TRUE_LABEL field
+     * (1 for "pos", -1 for "neg") that records which set they came from
+     * originally. The shuffled array is then split according to the `split'
+     * option, and all pages up to (but not including) the split index are used
+     * for the training set; the remaining pages are used for the test set.
+     * If fewer pages than `split' were found, a message is printed and the
+     * program exits with status 1. When exactly `split' pages were found the
+     * test set is empty.
+     *
+     * @param string $dataset_name prefix of index names to draw examples from
+     * @param string $class_label class label of the classifier the examples
+     *      will be used to train (used to name the crawl mix that iterates over
+     *      each index)
+     * @return array training and test datasets in an associative array with
+     *      keys `train' and `test', where each dataset is wrapped up in a
+     *      PageIterator, which offers the same nextPages / end_of_iterator
+     *      members that are used on the crawl mix iterator here.
+     */
+    function loadDataset($dataset_name, $class_label)
+    {
+        $crawls = $this->crawlModel->getCrawlList(false, true, NULL);
+        // The delimiter must be passed so that a '/' in the dataset name is
+        // escaped as well; otherwise it would terminate the pattern early.
+        $dataset_name = preg_quote($dataset_name, '/');
+        $re = '/^RECRAWL::'.$dataset_name.' (pos|neg)$/i';
+        $pages = array();
+        foreach ($crawls as $crawl) {
+            if (!preg_match($re, $crawl['DESCRIPTION'], $groups)) {
+                continue;
+            }
+            $label = strtolower($groups[1]);
+            $doc_label = $label == 'pos' ? 1 : -1;
+            $mix_iterator =
+                $this->classifierController->buildClassifierCrawlMix(
+                    $class_label, $crawl['CRAWL_TIME']);
+            while (!$mix_iterator->end_of_iterator) {
+                $new_pages = $mix_iterator->nextPages(5000);
+                /*
+                   This field can be added to the results from a crawl mix
+                   iterator, but we don't care about it, so we just discard it.
+                 */
+                if (isset($new_pages['NO_PROCESS'])) {
+                    unset($new_pages['NO_PROCESS']);
+                }
+                foreach ($new_pages as $page) {
+                    $page['TRUE_LABEL'] = $doc_label;
+                    $pages[] = $page;
+                }
+            }
+        }
+        shuffle($pages);
+        if (count($pages) < $this->options['split']) {
+            echo "split is larger than dataset\n";
+            exit(1);
+        }
+        $data = array();
+        $data['train'] = new PageIterator(
+            array_slice($pages, 0, $this->options['split']));
+        $data['test'] = new PageIterator(
+            array_slice($pages, $this->options['split']));
+        return $data;
+    }
+
+    /**
+     * Determines whether to run a classification test after a certain number
+     * of documents have been added to the training set. Whether or not to test
+     * is determined by the `test_interval' option, which may be either NULL,
+     * an integer, or a string. In the first case, testing only occurs after
+     * all training examples have been added; in the second case, testing
+     * occurs each time an additional constant number of training examples have
+     * been added; and in the final case, testing occurs on a fixed schedule of
+     * comma-separated offsets, such as "10,25,50,100".
+     * An integer interval of 0 is treated like NULL, since no multiple of it
+     * can be computed.
+     *
+     * @param int $i the size of the current training set
+     * @param int $total the total number of documents available to be added to
+     *      the training set
+     * @return bool true if the `test_interval' option specifies that a round
+     *      of testing should occur for the current training offset, and false
+     *      otherwise
+     */
+    function isTestPoint($i, $total)
+    {
+        $interval = $this->options['test_interval'];
+        if (is_null($interval)) {
+            return $i == $total;
+        }
+        if (is_int($interval)) {
+            if ($interval == 0) {
+                // "$i % 0" would be a division by zero.
+                return $i == $total;
+            }
+            return $i % $interval == 0;
+        }
+        // Match $i as one complete entry of the comma-separated schedule.
+        $re = '/(^|,)'.$i.'(,|$)/';
+        return preg_match($re, $interval) === 1;
+    }
+
+    /**
+     * Finalizes the current classifier, uses it to classify all test
+     * documents, and logs the classification error. The current classifier is
+     * saved to disk after finalizing (though not before), and left in
+     * `classify' mode. The iterator over the test dataset is reset for the
+     * next round of testing (if any). A document counts as misclassified when
+     * the label derived from its score (1 for a score of at least 0.5, and -1
+     * otherwise) differs from its TRUE_LABEL. When the test dataset is empty
+     * no error rate can be computed, and an error message is logged instead.
+     *
+     * @param object $classifier classifier instance to test
+     * @param array $data the array of training and test datasets, constructed
+     *      by loadDataset, of which only the `test' dataset is used.
+     */
+    function testClassifier($classifier, $data)
+    {
+        $classifier->prepareToFinalize();
+        $classifier->finalize();
+        Classifier::setClassifier($classifier);
+        $classifier->prepareToClassify();
+        $wrong = 0;
+        $total = 0;
+        $pages = $data['test'];
+        while (!$pages->end_of_iterator) {
+            $page = $pages->nextPage();
+            $score = $classifier->classify($page);
+            $page_label = $score >= 0.5 ? 1 : -1;
+            if ($page_label != $page['TRUE_LABEL']) {
+                $wrong++;
+            }
+            $total++;
+        }
+        if ($total == 0) {
+            // Happens when `split' equals the size of the whole dataset.
+            $this->log(-2, 'no test documents; error not computed');
+        } else {
+            $error = (float)$wrong / $total;
+            $this->log(0, 'error = %.4f', $error);
+        }
+        $pages->reset();
+    }
+
+    /**
+     * Writes out logging information according to a detail level. The first
+     * argument is an integer (potentially negative) indicating the level of
+     * detail for the log message, where larger numbers indicate greater
+     * detail.
Each message is prefixed with a marker chosen by its level
+     * of detail, but if the detail level is greater than the level specified
+     * by the `debug' option then nothing is printed. The available detail
+     * levels are treated as follows:
+     *
+     *  -2: Used for errors; prefix '! '
+     *  -1: Used for log of set options; prefix '# '
+     *  0+: Used for normal messages; prefix '> '
+     *
+     * Levels -2 and -1 are printed for every `debug' setting of -1 or more.
+     *
+     * The second argument is a printf-style string template specifying the
+     * message, and each following (optional) argument is used by the template.
+     * A newline is added automatically to each message.
+     *
+     * @param int $level level of detail for the message
+     * @param string $message printf-style template for the message
+     * @param string $args,... optional arguments to be used for the message
+     *      template
+     */
+    function log(/* varargs */)
+    {
+        $template_args = func_get_args();
+        // What remains after the shift is the template plus its arguments.
+        $detail = array_shift($template_args);
+        if ($detail > $this->options['debug']) {
+            return;
+        }
+        $marker = '> ';
+        if ($detail == -2) {
+            $marker = '! ';
+        } else if ($detail == -1) {
+            $marker = '# ';
+        }
+        echo $marker;
+        call_user_func_array('printf', $template_args);
+        echo "\n";
+    }
+
+    /**
+     * Logs the current options using the log method of this class, one
+     * "name = value" line per option at detail level -1. Nested arrays are
+     * walked recursively with their keys joined by periods, options whose
+     * value is NULL are left out, and booleans are written as the words
+     * 'true' and 'false'. This method is used to explicitly state which
+     * settings were used for a given run of an activity.
+     *
+     * @param array $root options (sub-)array to log; NULL stands for the
+     *      complete options member of this class
+     * @param string $prefix dotted path of the keys leading to $root, placed
+     *      in front of every option name that is logged
+     */
+    function logOptions($root = NULL, $prefix = '')
+    {
+        $settings = is_null($root) ? $this->options : $root;
+        foreach ($settings as $name => $setting) {
+            if (is_array($setting)) {
+                $this->logOptions($setting, $prefix.$name.'.');
+                continue;
+            }
+            if (is_null($setting)) {
+                continue;
+            }
+            if (is_bool($setting)) {
+                $setting = $setting ? 'true' : 'false';
+            }
+            $this->log(-1, '%s%s = %s', $prefix, $name, strval($setting));
+        }
+    }
+
+    /**
+     * Sets one or more options of the form NAME=VALUE according to a converter
+     * such as intval, floatval, and so on.
The options may be passed in either
+     * as a string (a single option) or as an array of strings, where each
+     * string corresponds to an option of the same type (e.g., int). NAME may
+     * use periods to address nested options (e.g., cls.chi2.max) and must
+     * name an existing entry of the options member. An unknown NAME, or an
+     * option without an '=', is reported through the log method at level -2
+     * and leaves the options untouched.
+     *
+     * @param string|array $opts single option in the format NAME=VALUE, or
+     *      array of options, each for the same target type (e.g., int)
+     * @param string $converter the name of a function that takes a string and
+     *      casts it to a particular type (e.g., intval, floatval); NULL keeps
+     *      the value as a string
+     */
+    function setOptions($opts, $converter = NULL)
+    {
+        if (!is_array($opts)) {
+            $opts = array($opts);
+        }
+        foreach ($opts as $opt) {
+            $split = strpos($opt, '=');
+            if ($split === false) {
+                $this->log(-2, 'malformed option, expected NAME=VALUE: "%s"',
+                    $opt);
+                continue;
+            }
+            $name = substr($opt, 0, $split);
+            $value = substr($opt, $split + 1);
+            if ($converter) {
+                // boolval is not available in older PHP versions, so fall
+                // back on a cast.
+                if ($converter == 'boolval' && !function_exists('boolval')) {
+                    $value = (bool)$value;
+                } else {
+                    $value = call_user_func($converter, $value);
+                }
+            }
+            /*
+               Walk down the nested options one name component at a time.
+               Success is tracked explicitly: a failed lookup of the LAST
+               component must not be mistaken for a completed walk, or the
+               value would be written over the enclosing array.
+             */
+            $field =& $this->options;
+            $found = true;
+            foreach (explode('.', $name) as $top) {
+                if (is_array($field) && array_key_exists($top, $field)) {
+                    $field =& $field[$top];
+                } else {
+                    $found = false;
+                    break;
+                }
+            }
+            if ($found) {
+                $field = $value;
+            } else {
+                $this->log(-2, 'unknown option: "%s"', $name);
+            }
+            // Drop the reference so nothing can write through it later.
+            unset($field);
+        }
+    }
+
+    /**
+     * Sets a default value for a runtime parameter. This method is used by
+     * activities to specify default values that may be overridden by passing
+     * the appropriate command-line flag. The value is only stored when the
+     * option is currently unset or NULL.
+     *
+     * @param string $name name of the option, where a period may be used to
+     *      address a nested option
+     * @param mixed $value value to store if the option has none yet
+     */
+    function setDefault($name, $value)
+    {
+        $fields = explode('.', $name);
+        $field =& $this->options;
+        // Descend to the array that holds the final name component.
+        while (count($fields) > 1) {
+            $top = array_shift($fields);
+            $field =& $field[$top];
+        }
+        $last = array_shift($fields);
+        if (!isset($field[$last])) {
+            $field[$last] = $value;
+        }
+    }
+}
+
+
+/**
+ * This class provides the same interface as an iterator over crawl mixes, but
+ * simply iterates over an array.
+ *
+ * This is used to gather all of the pages for a training set in one go (using
+ * a crawl mix iterator), then repeatedly iterate over them in memory, as
+ * though they were coming from the original crawl mix iterator.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ */
+class PageIterator
+{
+    /**
+     * The wrapped array of pages that is iterated over.
+     * @var array
+     */
+    var $pages;
+
+    /**
+     * Number of pages in the wrapped array, counted once on construction.
+     * @var int
+     */
+    var $length;
+
+    /**
+     * Offset into the wrapped array of the next page to hand out.
+     * @var int
+     */
+    var $pos;
+
+    /**
+     * True once the last page has been handed out (or when there are no
+     * pages at all).
+     * @var bool
+     */
+    var $end_of_iterator;
+
+    /**
+     * Wraps a (potentially empty) array of pages and positions the iterator
+     * on its first page.
+     *
+     * @param array $pages standard array of pages to iterate over
+     */
+    function __construct($pages)
+    {
+        $this->pages = $pages;
+        $this->length = count($this->pages);
+        $this->reset();
+    }
+
+    /**
+     * Rewinds the iterator so that the next page returned is the first one.
+     * An empty iterator is at its end right away.
+     */
+    function reset()
+    {
+        $this->pos = 0;
+        $this->end_of_iterator = ($this->length == 0);
+    }
+
+    /**
+     * Hands out up to the requested number of next pages, potentially an
+     * empty array if there are no pages left, and advances the iterator past
+     * them. The `end_of_iterator' flag is set once the last page has been
+     * handed out.
+     *
+     * @param int $n maximum number of pages to return, or -1 to return all
+     *      remaining pages
+     * @return array next $n pages, or less if there are fewer than $n
+     *      pages remaining
+     */
+    function nextPages($n = -1)
+    {
+        $remaining = $this->length - $this->pos;
+        $count = ($n == -1) ? $remaining : min($remaining, $n);
+        $first = $this->pos;
+        $this->pos = $first + $count;
+        if ($this->pos == $this->length) {
+            $this->end_of_iterator = true;
+        }
+        return array_slice($this->pages, $first, $count);
+    }
+
+    /**
+     * Single-page variant of nextPages: hands out just the next page (not
+     * wrapped in an array) and advances the iterator by one.
+     *
+     * @return array next page if available, and NULL otherwise
+     */
+    function nextPage()
+    {
+        $batch = $this->nextPages(1);
+        if (empty($batch)) {
+            return NULL;
+        }
+        return $batch[0];
+    }
+}
+
+try {
+    $classifier_tool = new ClassifierTool();
+    $classifier_tool->main();
+} catch (ErrorException $e) {
+    echo $e . "\n";
+}
+?>
\ No newline at end of file
diff --git a/bin/classifier_trainer.php b/bin/classifier_trainer.php
new file mode 100755
index 000000000..4e5b170b4
--- /dev/null
+++ b/bin/classifier_trainer.php
@@ -0,0 +1,125 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage bin
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+
+if(php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
+
+/*
+   Calculate base directory of script
+ */
+define("BASE_DIR", substr(
+    dirname(realpath($_SERVER['PHP_SELF'])), 0,
+    -strlen("/bin")));
+
+/*
+   We must specify that we want logging enabled
+ */
+define("NO_LOGGING", false);
+
+/*
+   Load in global configuration settings
+ */
+require_once BASE_DIR.'/configs/config.php';
+if(!PROFILE) {
+    // The two literals are concatenated, so the first one ends in a space.
+    echo "Please configure the search engine instance by visiting " .
+        "its web interface on localhost.\n";
+    exit();
+}
+
+/** Used to initialize and terminate the daemon */
+require_once BASE_DIR."/lib/crawl_daemon.php";
+/** Used to create, update, and delete user-trained classifiers. */
+require_once BASE_DIR."/lib/classifiers/classifier.php";
+
+/*
+   We'll set up multi-byte string handling to use UTF-8
+ */
+mb_internal_encoding("UTF-8");
+mb_regex_encoding("UTF-8");
+
+
+/*
+   If possible, set the memory limit high enough to fit all of the features and
+   training documents into memory.
+ */
+ini_set("memory_limit", "500M");
+
+
+/**
+ * This class is used to finalize a classifier via the web interface.
+ *
+ * Because finalizing involves training a logistic regression classifier on a
+ * potentially-large set of training examples, it can take much longer than
+ * would be allowed by the normal web execution time limit. So instead of
+ * trying to finalize a classifier directly in the controller that handles the
+ * web request, the controller kicks off a daemon that simply loads the
+ * classifier, finalizes it, and saves it back to disk.
+ *
+ * The classifier to finalize is specified by its class label, passed as the
+ * second command-line argument.
The following command would be used to run
+ * this script directly from the command-line:
+ *
+ *   $ php bin/classifier_trainer.php terminal LABEL
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ */
+class ClassifierTrainer
+{
+    /**
+     * This is the function that should be called to get the
+     * classifier_trainer to start training a logistic regression instance for
+     * a particular classifier. The class label corresponding to the
+     * classifier to be finalized should be passed as the second command-line
+     * argument. The classifier is loaded, finalized, and stored again; if no
+     * classifier can be loaded for the label, that is logged and nothing is
+     * trained. In both cases the daemon is stopped at the end.
+     */
+    function start()
+    {
+        global $argv;
+        CrawlDaemon::init($argv, "classifier_trainer");
+        $label = $argv[2];
+        crawlLog("Initializing classifier trainer log..",
+            $label.'-classifier_trainer');
+        $classifier = Classifier::getClassifier($label);
+        if ($classifier) {
+            $classifier->prepareToFinalize();
+            $classifier->finalize();
+            Classifier::setClassifier($classifier);
+            crawlLog("Training complete.\n");
+        } else {
+            // Without this check the method calls above would be made on a
+            // non-object and the daemon would die before being stopped.
+            crawlLog("No classifier found for label '{$label}'; ".
+                "nothing to train.\n");
+        }
+        CrawlDaemon::stop('classifier_trainer', $label);
+    }
+}
+
+$classifier_trainer = new ClassifierTrainer();
+$classifier_trainer->start();
+
+?>
\ No newline at end of file
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 0dcfddc8d..a616603db 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -90,6 +90,8 @@ require_once BASE_DIR."/lib/url_parser.php";
 require_once BASE_DIR."/lib/phrase_parser.php";
 /** For user-defined processing on page summaries*/
 require_once BASE_DIR."/lib/page_rule_parser.php";
+/** For user-trained classification of page summaries*/
+require_once BASE_DIR."/lib/classifiers/classifier.php";
 /** for crawlHash and crawlLog */
 require_once BASE_DIR."/lib/utility.php";
 /** for crawlDaemon function */
@@ -1109,6 +1111,8 @@ class Fetcher implements CrawlConstants
                 } else {
                     $info[self::ARC_DATA] = $pages;
                 }
+            } else if(isset($info['ARCHIVE_BUNDLE_ERROR'])) {
+                crawlLog(" ".$info['ARCHIVE_BUNDLE_ERROR']);
             }
             crawlLog("Time to fetch archive data from name server ".
@@ -1193,7 +1197,26 @@ class Fetcher implements CrawlConstants $this->$field = $info[$info_field]; } } - + if(!empty($info[self::ACTIVE_CLASSIFIERS_DATA])){ + /* + The classifier data is set by the fetch controller for each + active classifier, and is a compressed, serialized structure + containing all of the objects needed for classification. + */ + $classifiers_data = $info[self::ACTIVE_CLASSIFIERS_DATA]; + $this->classifiers = array(); + foreach ($classifiers_data as $label => $classifier_data) { + if ($classifier_data) { + $classifier = Classifier::newClassifierFromData( + $classifier_data); + $this->classifiers[] = $classifier; + crawlLog("Classifying with '{$label}' classifier."); + } else { + crawlLog("Skipping classifier '{$label}'; missing ". + "finalized data."); + } + } + } if(isset($info[self::PAGE_RULES]) ){ $rule_string = implode("\n", $info[self::PAGE_RULES]); $rule_string = html_entity_decode($rule_string, ENT_QUOTES); @@ -1547,6 +1570,17 @@ class Fetcher implements CrawlConstants isset($site[self::ROBOT_PATHS])) { $summarized_site_pages[$i][self::JUST_METAS] = true; } + if(isset($site[self::DOC_INFO][self::META_WORDS])) { + if (!isset($summarized_site_pages[$i][self::META_WORDS])) { + $summarized_site_pages[$i][self::META_WORDS] = + $site[self::DOC_INFO][self::META_WORDS]; + } else { + $summarized_site_pages[$i][self::META_WORDS] = + array_merge( + $summarized_site_pages[$i][self::META_WORDS], + $site[self::DOC_INFO][self::META_WORDS]); + } + } if(isset($site[self::DOC_INFO][self::LANG])) { if($site[self::DOC_INFO][self::LANG] == 'en' && $site[self::ENCODING] != "UTF-8") { @@ -1580,6 +1614,10 @@ class Fetcher implements CrawlConstants $this->page_rule_parser->executeRuleTrees( $summarized_site_pages[$i]); } + if(!empty($this->classifiers)) { + Classifier::labelPage($summarized_site_pages[$i], + $this->classifiers); + } $i++; } } // end for diff --git a/configs/createdb.php b/configs/createdb.php index bff6681bd..3721eb3e1 100755 --- 
a/configs/createdb.php +++ b/configs/createdb.php @@ -80,7 +80,7 @@ if(!$profile_model->createDatabaseTables($db, $dbinfo)) { exit(); } -$db->execute("INSERT INTO VERSION VALUES (15)"); +$db->execute("INSERT INTO VERSION VALUES (16)"); //default account is root without a password $sql ="INSERT INTO USER VALUES (1, 'root', '".crawlCrypt('')."' ) "; @@ -112,7 +112,8 @@ $db->execute("INSERT INTO LOCALE VALUES ( 18, 'hi', 'हिन्दी', 'lr-tb')"); $db->execute("INSERT INTO LOCALE VALUES (19, 'tr', 'Türkçe', 'lr-tb')"); $db->execute("INSERT INTO LOCALE VALUES (20, 'fa', 'فارسی', 'rl-tb')"); -$db->execute("INSERT INTO LOCALE VALUES (21, 'te', 'తెలుగు', 'lr-tb')"); +$db->execute("INSERT INTO LOCALE VALUES (21, 'te', + 'తెలుగు', 'lr-tb')"); $sql ="INSERT INTO ROLE VALUES (1, 'Admin' ) "; $db->execute($sql); @@ -128,31 +129,36 @@ $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, 8)"); $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, 9)"); $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, 10)"); $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, 11)"); +$db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, 12)"); $db->execute("INSERT INTO ACTIVITY VALUES (1, 1, 'manageAccount')"); $db->execute("INSERT INTO ACTIVITY VALUES (2, 2, 'manageUsers')"); $db->execute("INSERT INTO ACTIVITY VALUES (3, 3, 'manageRoles')"); $db->execute("INSERT INTO ACTIVITY VALUES (4, 4, 'manageCrawls')"); $db->execute("INSERT INTO ACTIVITY VALUES (5, 5, 'mixCrawls')"); -$db->execute("INSERT INTO ACTIVITY VALUES (6, 6, 'pageOptions')"); -$db->execute("INSERT INTO ACTIVITY VALUES (7, 7, 'resultsEditor')"); -$db->execute("INSERT INTO ACTIVITY VALUES (8, 8, 'searchSources')"); -$db->execute("INSERT INTO ACTIVITY VALUES (9, 9, 'manageMachines')"); -$db->execute("INSERT INTO ACTIVITY VALUES (10, 10, 'manageLocales')"); -$db->execute("INSERT INTO ACTIVITY VALUES (11, 11, 'configure')"); +$db->execute("INSERT INTO ACTIVITY VALUES (6, 6, 'manageClassifiers')"); +$db->execute("INSERT INTO ACTIVITY VALUES (7, 7, 
'pageOptions')"); +$db->execute("INSERT INTO ACTIVITY VALUES (8, 8, 'resultsEditor')"); +$db->execute("INSERT INTO ACTIVITY VALUES (9, 9, 'searchSources')"); +$db->execute("INSERT INTO ACTIVITY VALUES (10, 10, 'manageMachines')"); +$db->execute("INSERT INTO ACTIVITY VALUES (11, 11, 'manageLocales')"); +$db->execute("INSERT INTO ACTIVITY VALUES (12, 12, 'configure')"); $db->execute("INSERT INTO TRANSLATION VALUES (1,'db_activity_manage_account')"); $db->execute("INSERT INTO TRANSLATION VALUES (2, 'db_activity_manage_users')"); $db->execute("INSERT INTO TRANSLATION VALUES (3, 'db_activity_manage_roles')"); $db->execute("INSERT INTO TRANSLATION VALUES (4, 'db_activity_manage_crawl')"); $db->execute("INSERT INTO TRANSLATION VALUES (5, 'db_activity_mix_crawls')"); -$db->execute("INSERT INTO TRANSLATION VALUES (6, 'db_activity_file_options')"); -$db->execute("INSERT INTO TRANSLATION VALUES (7,'db_activity_results_editor')"); -$db->execute("INSERT INTO TRANSLATION VALUES(8,'db_activity_search_services')"); -$db->execute("INSERT INTO TRANSLATION VALUES(9,'db_activity_manage_machines')"); -$db->execute("INSERT INTO TRANSLATION VALUES (10, +$db->execute("INSERT INTO TRANSLATION VALUES (6, + 'db_activity_manage_classifiers')"); +$db->execute("INSERT INTO TRANSLATION VALUES (7, 'db_activity_file_options')"); +$db->execute("INSERT INTO TRANSLATION VALUES (8,'db_activity_results_editor')"); +$db->execute("INSERT INTO TRANSLATION VALUES(9,'db_activity_search_services')"); +$db->execute("INSERT INTO TRANSLATION VALUES(10, + 'db_activity_manage_machines')"); +$db->execute("INSERT INTO TRANSLATION VALUES (11, 'db_activity_manage_locales')"); -$db->execute("INSERT INTO TRANSLATION VALUES (11, 'db_activity_configure')"); +$db->execute("INSERT INTO TRANSLATION VALUES (12, 'db_activity_configure')"); $db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (1, 1, 'Manage Account' )"); @@ -160,12 +166,14 @@ $db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (2, 1, 'Manage Users')"); 
$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (3, 1, 'Manage Roles')"); $db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (4, 1, 'Manage Crawls')"); $db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (5, 1, 'Mix Crawls')"); -$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (6, 1, 'Page Options')"); -$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (7, 1, 'Results Editor')"); -$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (8, 1, 'Search Sources')"); -$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (9, 1, 'Manage Machines')"); -$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (10, 1, 'Manage Locales')"); -$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (11, 1, 'Configure')"); +$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (6, 1, 'Classifiers')"); +$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (7, 1, 'Page Options')"); +$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (8, 1, 'Results Editor')"); +$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (9, 1, 'Search Sources')"); +$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (10, + 1, 'Manage Machines')"); +$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (11, 1, 'Manage Locales')"); +$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (12, 1, 'Configure')"); $db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (1, 5, 'Modifier votre compte' )"); diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php index 5296110cd..9dcf19733 100755 --- a/controllers/admin_controller.php +++ b/controllers/admin_controller.php @@ -41,6 +41,8 @@ require_once BASE_DIR."/lib/crawl_constants.php"; require_once BASE_DIR."/lib/url_parser.php"; /** Used in rule parser test in page options */ require_once BASE_DIR."/lib/page_rule_parser.php"; +/** Used to create, update, and delete user-trained classifiers. 
*/ +require_once BASE_DIR."/lib/classifiers/classifier.php"; /** Loads crawl_daemon to manage news_updater */ require_once BASE_DIR."/lib/crawl_daemon.php"; /** get processors for different file types */ @@ -79,9 +81,9 @@ class AdminController extends Controller implements CrawlConstants * @var array */ var $activities = array("signin", "manageAccount", "manageUsers", - "manageRoles", "manageCrawls", "pageOptions", "resultsEditor", - "manageMachines", "manageLocales", "crawlStatus", "mixCrawls", - "machineStatus", "searchSources", "configure"); + "manageRoles", "manageCrawls", "pageOptions", "manageClassifiers", + "resultsEditor", "manageMachines", "manageLocales", "crawlStatus", + "mixCrawls", "machineStatus", "searchSources", "configure"); /** * An array of activities which are periodically updated within other * activities that they live. For example, within manage crawl, @@ -930,18 +932,22 @@ class AdminController extends Controller implements CrawlConstants $crawl_params[self::DISALLOWED_SITES] = isset($seed_info['disallowed_sites']['url']) ? $seed_info['disallowed_sites']['url'] : array(); - $crawl_params[self::PAGE_RULES] = - isset($seed_info['page_rules']['rule']) ? - $seed_info['page_rules']['rule'] : array(); - - if(isset($seed_info['indexing_plugins']['plugins'])) { - $crawl_params[self::INDEXING_PLUGINS] = - $seed_info['indexing_plugins']['plugins']; - } if(isset($seed_info['indexed_file_types']['extensions'])) { $crawl_params[self::INDEXED_FILE_TYPES] = $seed_info['indexed_file_types']['extensions']; } + if(isset($seed_info['active_classifiers']['label'])) { + // Note that 'label' is actually an array of active class labels. + $crawl_params[self::ACTIVE_CLASSIFIERS] = + $seed_info['active_classifiers']['label']; + } + if(isset($seed_info['indexing_plugins']['plugins'])) { + $crawl_params[self::INDEXING_PLUGINS] = + $seed_info['indexing_plugins']['plugins']; + } + $crawl_params[self::PAGE_RULES] = + isset($seed_info['page_rules']['rule']) ? 
+            $seed_info['page_rules']['rule'] : array();
     }
 
     /**
@@ -1025,7 +1031,7 @@ class AdminController extends Controller implements CrawlConstants
                 $seed_info = $this->crawlModel->getSeedInfo();
             }
             $page_options_properties = array('indexed_file_types',
-                'page_rules', 'indexing_plugins');
+                'active_classifiers', 'page_rules', 'indexing_plugins');
             //these properties should be changed under page_options not here
             foreach($page_options_properties as $property) {
                 if(isset($seed_current[$property])) {
@@ -1437,9 +1443,6 @@ class AdminController extends Controller implements CrawlConstants
      *
      * This activity allows a user to specify the page range size to be
      * be used during a crawl as well as which file types can be downloaded
-     *
-     * @return array $data info about the groups and their contents for a
-     *      particular crawl mix
      */
     function pageOptions()
     {
@@ -1621,6 +1624,30 @@ class AdminController extends Controller implements CrawlConstants
         }
         $seed_info["indexed_file_types"]["extensions"] = $filetypes;
 
+        $data['CLASSIFIERS'] = array();
+        $active_classifiers = array();
+        foreach (Classifier::getClassifierList() as $classifier) {
+            $label = $classifier->class_label;
+            $ison = false;
+            if (isset($_REQUEST['classifier']) && !$loaded) {
+                if (isset($_REQUEST['classifier'][$label])) {
+                    $ison = true;
+                }
+            } else if (isset($seed_info['active_classifiers']['label'])) {
+                if (in_array($label,
+                    $seed_info['active_classifiers']['label'])) {
+                    $ison = true;
+                }
+            }
+            if ($ison) {
+                $data['CLASSIFIERS'][$label] = 'checked="checked"';
+                $active_classifiers[] = $label;
+            } else {
+                $data['CLASSIFIERS'][$label] = '';
+            }
+        }
+        $seed_info['active_classifiers']['label'] = $active_classifiers;
+
         if(isset($seed_info['page_rules']['rule'])) {
             $data['page_rules'] = $this->convertArrayLines(
                 $seed_info['page_rules']['rule']);
@@ -1773,6 +1800,207 @@ class AdminController extends Controller implements CrawlConstants
         return $data;
     }
 
+    /**
+     * Handles admin requests for creating, editing, and deleting classifiers.
+     *
+     * This activity implements the logic for the page that lists existing
+     * classifiers, including the actions that can be performed on them.
+     * The action is chosen by $_REQUEST['arg'] and the classifier it applies
+     * to by $_REQUEST['class_label']; a missing label is treated as the
+     * empty string. Edit, finalize and delete requests for a label that has
+     * no classifier report 'admin_controller_no_classifier' instead of acting.
+     *
+     * @return array $data fields for the view: the element to draw, any
+     *      script to run, and the (possibly updated) map of classifiers
+     */
+    function manageClassifiers()
+    {
+        $possible_arguments = array('createclassifier', 'editclassifier',
+            'finalizeclassifier', 'deleteclassifier');
+
+        $data['ELEMENT'] = 'manageclassifiersElement';
+        $data['SCRIPT'] = '';
+
+        $machine_urls = $this->machineModel->getQueueServerUrls();
+        $num_machines = count($machine_urls);
+        if ($num_machines < 1 || ($num_machines == 1 &&
+            UrlParser::isLocalhostUrl($machine_urls[0]))) {
+            $machine_urls = NULL;
+        }
+
+        $data['leftorright'] =
+            (getLocaleDirection() == 'ltr') ? 'right': 'left';
+
+        $classifiers = Classifier::getClassifierList();
+
+        if (isset($_REQUEST['arg']) &&
+            in_array($_REQUEST['arg'], $possible_arguments)) {
+            // A request may name an action without a label; avoid reading an
+            // undefined index in that case.
+            $label = isset($_REQUEST['class_label']) ?
+                $this->clean($_REQUEST['class_label'], 'string') : '';
+            $label = Classifier::cleanLabel($label);
+            switch ($_REQUEST['arg'])
+            {
+                case 'createclassifier':
+                    if (!isset($classifiers[$label])) {
+                        $classifier = new Classifier($label);
+                        Classifier::setClassifier($classifier);
+                        $classifiers[$label] = $classifier;
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_new_classifier').'</h1>\');';
+                    } else {
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_classifier_exists').
+                            '</h1>\');';
+                    }
+                    break;
+
+                case 'editclassifier':
+                    if (isset($classifiers[$label])) {
+                        $data['class_label'] = $label;
+                        $this->editClassifier($data, $classifiers,
+                            $machine_urls);
+                    } else {
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_no_classifier').
+                            '</h1>\');';
+                    }
+                    break;
+
+                case 'finalizeclassifier':
+                    /*
+                       Finalizing is too expensive to be done directly in the
+                       controller that responds to the web request. Instead, a
+                       daemon is launched to finalize the classifier
+                       asynchronously and save it back to disk when it's done.
+                       In the meantime, a flag is set to indicate the current
+                       finalizing state.
+
+                       The daemon is only launched for a label that actually
+                       has a classifier; otherwise a trainer would be started
+                       for nothing and the flag below would be set on a
+                       non-object.
+                     */
+                    if (isset($classifiers[$label])) {
+                        CrawlDaemon::start("classifier_trainer", $label,
+                            '', -1);
+                        $classifier = $classifiers[$label];
+                        $classifier->finalized = Classifier::FINALIZING;
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_finalizing_classifier').
+                            '</h1>\');';
+                    } else {
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_no_classifier').
+                            '</h1>\');';
+                    }
+                    break;
+
+                case 'deleteclassifier':
+                    /*
+                       In addition to deleting the classifier, we also want to
+                       delete the associated crawl mix (if one exists) used to
+                       iterate over existing indexes in search of new training
+                       examples.
+                     */
+                    if (isset($classifiers[$label])) {
+                        unset($classifiers[$label]);
+                        Classifier::deleteClassifier($label);
+                        $mix_name = Classifier::getCrawlMixName($label);
+                        $mix_time = $this->crawlModel->getCrawlMixTimestamp(
+                            $mix_name);
+                        if ($mix_time) {
+                            $this->crawlModel->deleteCrawlMixIteratorState(
+                                $mix_time);
+                            $this->crawlModel->deleteCrawlMix($mix_time);
+                        }
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_classifier_deleted').
+                            '</h1>\');';
+                    } else {
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_no_classifier').
+                            '</h1>\');';
+                    }
+                    break;
+            }
+        }
+
+        $data['classifiers'] = $classifiers;
+        return $data;
+    }
+
+    /**
+     * Handles the particulars of editing a classifier, which includes changing
+     * its label and adding training examples.
+     *
+     * This activity directly handles changing the class label, but not adding
+     * training examples. The latter activity is done interactively without
+     * reloading the page via XmlHttpRequests, coordinated by the classifier
+     * controller dedicated to that task.
+     *
+     * @param array $data data to be passed on to the view; modified in place
+     *      (element, scripts, crawl list, classifier and class label)
+     * @param array $classifiers map from class labels to their associated
+     *      classifiers
+     * @param array $machine_urls string urls of machines managed by this
+     *      Yioop name server
+     */
+    function editClassifier(&$data, $classifiers, $machine_urls)
+    {
+        $data['ELEMENT'] = 'editclassifierElement';
+        $data['INCLUDE_SCRIPTS'] = array('classifiers');
+
+        // We want recrawls, but not archive crawls.
+        $crawls = $this->crawlModel->getCrawlList(false, true, $machine_urls);
+        $data['CRAWLS'] = $crawls;
+
+        $classifier = $classifiers[$data['class_label']];
+
+        if (isset($_REQUEST['update']) && $_REQUEST['update'] == 'update') {
+            if (isset($_REQUEST['rename_label'])) {
+                /*
+                   Clean the new label with the same routine used when a
+                   classifier is created, so both paths accept the same
+                   labels.
+                 */
+                $new_label = $this->clean($_REQUEST['rename_label'], 'string');
+                $new_label = Classifier::cleanLabel($new_label);
+                /*
+                   A label that is empty after cleaning is ignored: renaming
+                   to '' would save the classifier under an empty label and
+                   then delete the original.
+                 */
+                if ($new_label !== '') {
+                    if (!isset($classifiers[$new_label])) {
+                        /*
+                           Classifiers are objects whose label lives in the
+                           class_label property (as read in pageOptions), so
+                           use property access rather than array syntax.
+                         */
+                        $old_label = $classifier->class_label;
+                        $classifier->class_label = $new_label;
+                        Classifier::setClassifier($classifier);
+                        Classifier::deleteClassifier($old_label);
+                        $data['class_label'] = $new_label;
+                    } else {
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_classifier_exists').
+                            '</h1>\');';
+                    }
+                }
+            }
+        }
+
+        $data['classifier'] = $classifier;
+
+        /*
+           Translations for the classification javascript. The object is
+           emitted with json_encode so that quotes, backslashes or newlines
+           in a translation cannot break the generated script. The tl()
+           calls keep literal string ids in case tooling scans the source
+           for them (assumption -- confirm against the locale tooling).
+         */
+        $translations = array(
+            'editclassifier_load_failed' =>
+                tl('editclassifier_load_failed'),
+            'editclassifier_loading' =>
+                tl('editclassifier_loading'),
+            'editclassifier_added_examples' =>
+                tl('editclassifier_added_examples'),
+            'editclassifier_label_update_failed' =>
+                tl('editclassifier_label_update_failed'),
+            'editclassifier_updating' =>
+                tl('editclassifier_updating'),
+            'editclassifier_acc_update_failed' =>
+                tl('editclassifier_acc_update_failed'),
+            'editclassifier_na' =>
+                tl('editclassifier_na'),
+            'editclassifier_no_docs' =>
+                tl('editclassifier_no_docs'),
+            'editclassifier_num_docs' =>
+                tl('editclassifier_num_docs'),
+            'editclassifier_in_class' =>
+                tl('editclassifier_in_class'),
+            'editclassifier_not_in_class' =>
+                tl('editclassifier_not_in_class'),
+            'editclassifier_skip' =>
+                tl('editclassifier_skip'),
+            'editclassifier_prediction' =>
+                tl('editclassifier_prediction'),
+            'editclassifier_scores' =>
+                tl('editclassifier_scores'));
+        $data['SCRIPT'] .= "window.tl = ".json_encode($translations).";";
+
+        /*
+           We pass along authentication information to the client, so that it
+           can authenticate any XmlHttpRequests that it makes in order to label
+           documents.
+         */
+        $time = strval(time());
+        $session = md5($time.AUTH_KEY);
+        $data['SCRIPT'] .=
+            "Classifier.initialize(".
+                "'{$data['class_label']}',".
+                "'{$session}',".
+                "'{$time}');";
+    }
+
     /**
      * Handles admin request related to the search filter activity
      *
diff --git a/controllers/classifier_controller.php b/controllers/classifier_controller.php
new file mode 100644
index 000000000..626e5a79a
--- /dev/null
+++ b/controllers/classifier_controller.php
@@ -0,0 +1,351 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage controller + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009 - 2013 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** Load base controller class if needed */ +require_once BASE_DIR."/controllers/controller.php"; +/** Loads common constants for web crawling */ +require_once BASE_DIR."/lib/crawl_constants.php"; +/** Loads url_parser to clean resource name */ +require_once BASE_DIR."/lib/url_parser.php"; +/** Loads utilities for webencode and decode. */ +require_once BASE_DIR."/lib/utility.php"; +/** Loads the classifier library for managing user-trained classifiers */ +require_once BASE_DIR."/lib/classifiers/classifier.php"; +/** Loads mix archive iterator to iterate through mixes for classification */ +require_once BASE_DIR."/lib/archive_bundle_iterators/". + "mix_archive_bundle_iterator.php"; + +/** + * This class handles XmlHttpRequests to label documents during classifier + * construction. + * + * Searching for new documents to label and add to the training set is a + * heavily-interactive operation, so it is implemented using asynchronous + * requests to this controller in order to fetch candidates for labeling and + * add labels without reloading the classifier edit page. The admin controller + * takes care of first displaying the "edit classifier" page, and handles + * requests to change a classifier's class label, but this controller handles + * the other asynchronous requests issued by the JavaScript on the page. 
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage controller
+ */
+class ClassifierController extends Controller implements CrawlConstants
+{
+    /**
+     * Models used by this controller
+     * @var array
+     */
+    var $models = array("crawl", "phrase");
+
+    /**
+     * Only outputs JSON data so don't need view
+     * @var array
+     */
+    var $views = array();
+
+    /**
+     * These are the activities supported by this controller
+     * @var array
+     */
+    var $activities = array("classify");
+
+
+    /**
+     * Checks that the request seems to be coming from a legitimate, logged-in
+     * user, then dispatches to the appropriate activity.
+     */
+    function processRequest()
+    {
+        if(!isset($_REQUEST['a']) || !$this->checkRequest()) {return;}
+        $activity = $_REQUEST['a'];
+        if(in_array($activity, $this->activities)) {$this->$activity();}
+    }
+
+    /**
+     * Finds the next document for which to request a label, sometimes first
+     * recording the label that the user selected for the last document. This
+     * method should only be called via an XmlHttpRequest initiated by the edit
+     * classifier JavaScript, and consequently it always writes out
+     * JSON-encoded data, which is easily decoded by the page JavaScript.
+     *
+     * Request fields read: 'arg' (one of getdocs, addlabel, updateaccuracy),
+     * 'label' (class label of the classifier), and optionally 'index',
+     * 'type', 'keywords' and 'doc_to_label'. Missing optional fields fall
+     * back to empty defaults rather than being read as undefined. If no
+     * classifier can be loaded for the label, a 404 status with a JSON
+     * error object is sent instead of the usual statistics.
+     */
+    function classify()
+    {
+        $arg = isset($_REQUEST['arg']) ?
+            $this->clean($_REQUEST['arg'], 'string') : '';
+        /*
+           The label selects which classifier is loaded from disk, so it is
+           cleaned with the same routine the admin controller uses before it
+           creates or looks up a classifier.
+         */
+        $label = '';
+        if (isset($_REQUEST['label'])) {
+            $label = Classifier::cleanLabel(
+                $this->clean($_REQUEST['label'], 'string'));
+        }
+
+        /*
+           These are referenced by more than one action below, so they get
+           defaults for requests that do not supply them.
+         */
+        $index = 0;
+        $source_type = '';
+        $keywords = '';
+        $has_index = isset($_REQUEST['index']);
+        if ($has_index) {
+            $index = $this->clean($_REQUEST['index'], 'int');
+            if (intval($index) == 1) {
+                // TODO Fail in case that there's no current index
+                $index = $this->crawlModel->getCurrentIndexDatabaseName();
+            }
+            if (isset($_REQUEST['type'])) {
+                $source_type = $this->clean($_REQUEST['type'], 'string');
+            }
+            if (isset($_REQUEST['keywords'])) {
+                $keywords = $this->clean($_REQUEST['keywords'], 'string');
+            }
+        }
+
+        /*
+           The call to prepareToLabel is important; it loads all of the data
+           required to manage the training set from disk, and also determines
+           what will be saved *back* to disk later.
+         */
+        $classifier = Classifier::getClassifier($label);
+        if (!$classifier) {
+            /*
+               NOTE(review): assumes getClassifier returns a falsy value for
+               an unknown label -- confirm against the classifier library.
+             */
+            header("HTTP/1.0 404 Not Found");
+            $this->sendJsonResponse(array('error' => 'unknown classifier'));
+            return;
+        }
+        $classifier->prepareToLabel();
+
+        $data = array();
+
+        switch ($arg)
+        {
+            case 'getdocs':
+                /*
+                   Load documents in from a user-specified index, and find the
+                   next best one to label (for 'manual' source type), or label
+                   them all with a single label (for either the 'positive' or
+                   'negative' source types).
+                 */
+                if (!$has_index) {
+                    // Without an index there is nothing to build a mix over.
+                    break;
+                }
+                $mix_iterator = $this->buildClassifierCrawlMix(
+                    $label, $index, $keywords);
+                if ($source_type == 'manual') {
+                    $num_docs = $classifier->initBuffer($mix_iterator);
+                    $classifier->computeBufferDensities();
+                    $data['num_docs'] = $num_docs;
+                    list($new_doc, $disagreement) =
+                        $classifier->findNextDocumentToLabel();
+                    if ($new_doc) {
+                        $score = $classifier->classify($new_doc);
+                        $data['new_doc'] = $this->prepareUnlabelledDocument(
+                            $new_doc, $score, $disagreement,
+                            $index, $keywords);
+                    }
+                    Classifier::setClassifier($classifier);
+                } else if ($source_type == 'positive' ||
+                    $source_type == 'negative') {
+                    $doc_label = ($source_type == 'positive') ? 1 : -1;
+                    $add_count = $classifier->addAllDocuments(
+                        $mix_iterator, $doc_label);
+                    if ($add_count > 0) {
+                        /*
+                           Pass true to always update accuracy after adding a
+                           batch of documents all at once.
+                         */
+                        $classifier->train(true);
+                        Classifier::setClassifier($classifier);
+                    }
+                    $data['add_count'] = $add_count;
+                }
+                break;
+
+            case 'addlabel':
+                /*
+                   First label the last candidate document presented to the
+                   user (potentially skipping it instead of actually applying a
+                   label), then pick the next best candidate for labeling.
+                   When skipping a document instead of adding a label, avoid
+                   re-training since the training set hasn't actually changed.
+                 */
+                $doc = (isset($_REQUEST['doc_to_label']) &&
+                    is_array($_REQUEST['doc_to_label'])) ?
+                    $_REQUEST['doc_to_label'] : array();
+                $mix_iterator = $this->retrieveClassifierCrawlMix($label);
+                $labels_changed = false;
+                if (isset($doc['key']) && isset($doc['label'])) {
+                    $key = webdecode($this->clean($doc['key'], 'string'));
+                    $doc_label = $this->clean($doc['label'], 'int');
+                    $labels_changed = $classifier->labelDocument(
+                        $key, $doc_label);
+                }
+                $num_docs = $classifier->refreshBuffer($mix_iterator);
+                $classifier->computeBufferDensities();
+                $data['num_docs'] = $num_docs;
+                if ($labels_changed) {
+                    // Accuracy is only refreshed on every tenth example.
+                    $update_accuracy = $classifier->total > 0 &&
+                        $classifier->total % 10 == 0;
+                    $classifier->train($update_accuracy);
+                }
+                list($new_doc, $disagreement) =
+                    $classifier->findNextDocumentToLabel();
+                if ($new_doc) {
+                    $score = $classifier->classify($new_doc);
+                    $data['new_doc'] = $this->prepareUnlabelledDocument(
+                        $new_doc, $score, $disagreement,
+                        $index, $keywords);
+                }
+                Classifier::setClassifier($classifier);
+                break;
+
+            case 'updateaccuracy':
+                /*
+                   Don't do anything other than re-compute the accuracy for the
+                   current training set.
+                 */
+                $classifier->updateAccuracy();
+                Classifier::setClassifier($classifier);
+                break;
+        }
+
+        /*
+           No matter which activity we ended up carrying out, always include
+           the statistics that *might* have changed so that the client can just
+           naively keep them up to date.
+         */
+        $data['positive'] = $classifier->positive;
+        $data['negative'] = $classifier->negative;
+        $data['total'] = $classifier->total;
+        $data['accuracy'] = $classifier->accuracy;
+
+        /*
+           Pass along a new authentication token so that the client can make a
+           new authenticated request after this one.
+         */
+        $data['authTime'] = strval(time());
+        $data['authSession'] = md5($data['authTime'].AUTH_KEY);
+
+        $this->sendJsonResponse($data);
+    }
+
+    /* PRIVATE METHODS */
+
+    /**
+     * Writes $data to the client as a JSON document, together with the
+     * Content-Type and Content-Length headers describing it.
+     *
+     * @param array $data values to encode and send
+     */
+    function sendJsonResponse($data)
+    {
+        $response = json_encode($data);
+        header("Content-Type: application/json");
+        header("Content-Length: ".strlen($response));
+        echo $response;
+    }
+
+    /**
+     * Creates a new crawl mix for an existing index, with an optional query,
+     * and returns an iterator for the mix. The crawl mix name is derived from
+     * the class label, so that it can be easily retrieved and deleted later
+     * on.
+     *
+     * @param string $label class label of the classifier the new crawl mix
+     *      will be associated with
+     * @param int $crawl_time timestamp of the index to be iterated over
+     * @param string $keywords an optional query used to restrict the pages
+     *      retrieved by the crawl mix
+     * @return object A MixArchiveBundleIterator instance that will iterate
+     *      over the pages of the requested index
+     */
+    function buildClassifierCrawlMix($label, $crawl_time, $keywords)
+    {
+        $mix_time = time();
+        $mix_name = Classifier::getCrawlMixName($label);
+
+        // Replace any existing crawl mix.
+        $old_time = $this->crawlModel->getCrawlMixTimestamp($mix_name);
+        if ($old_time) {
+            $this->crawlModel->deleteCrawlMixIteratorState($old_time);
+            $this->crawlModel->deleteCrawlMix($old_time);
+        }
+
+        $this->crawlModel->setCrawlMix(array(
+            'MIX_TIMESTAMP' => $mix_time,
+            'MIX_NAME' => $mix_name,
+            'GROUPS' => array(
+                array(
+                    'RESULT_BOUND' => 1,
+                    'COMPONENTS' => array(
+                        array(
+                            'CRAWL_TIMESTAMP' => $crawl_time,
+                            'WEIGHT' => 1.0,
+                            'KEYWORDS' => $keywords))))));
+
+        return new MixArchiveBundleIterator($mix_time, $mix_time);
+    }
+
+    /**
+     * Retrieves an iterator for an existing crawl mix. The crawl mix remembers
+     * its previous offset, so that the new iterator picks up where the
+     * previous one left off.
+     *
+     * @param string $label class label of the classifier this crawl mix is
+     *      associated with
+     * @return object new MixArchiveBundleIterator instance that picks up where
+     *      the previous one left off
+     */
+    function retrieveClassifierCrawlMix($label)
+    {
+        $mix_name = Classifier::getCrawlMixName($label);
+        // NOTE(review): no check is made here that a mix with this name
+        // exists; presumably a 'getdocs' request always precedes -- confirm.
+        $mix_time = $this->crawlModel->getCrawlMixTimestamp($mix_name);
+        return new MixArchiveBundleIterator($mix_time, $mix_time);
+    }
+
+    /**
+     * Creates a fresh array from an existing page summary array, and augments
+     * it with extra data relevant to the labeling interface on the client.
+     *
+     * @param array $page original page summary array
+     * @param float $score classification score (estimated by the Naive Bayes
+     *      text classification algorithm) for $page
+     * @param float $disagreement disagreement score computed for $page
+     * @param int $crawl_time index the page came from
+     * @param string $keywords query supplied to the crawl mix used to find
+     *      $page
+     * @return array reduced page summary structure containing only the
+     *      information that the client needs to display a summary of the page
+     */
+    function prepareUnlabelledDocument($page, $score, $disagreement,
+        $crawl_time, $keywords)
+    {
+        // Highlight the query keywords, if any.
+        $disjunct_phrases = explode("|", $keywords);
+        $words = array();
+        foreach ($disjunct_phrases as $disjunct_phrase) {
+            list($word_struct, $format_words) =
+                $this->phraseModel->parseWordStructConjunctiveQuery(
+                    $disjunct_phrase);
+            $words = array_merge($words, $format_words);
+        }
+        $title = $this->phraseModel->boldKeywords(
+            $page[self::TITLE], $words);
+        $description = $this->phraseModel->getSnippets(
+            strip_tags($page[self::DESCRIPTION]), $words, 400);
+        $description = $this->phraseModel->boldKeywords(
+            $description, $words);
+        $cache_link = "?c=search&a=cache".
+            "&q=".urlencode($keywords).
+            "&arg=".urlencode($page[self::URL]).
+            "&its=".$crawl_time;
+        /*
+           Note that the confidence is a transformation of the score that
+           converts it into a value between 0 and 1, where it's 0 if the score
+           was exactly 0.5, and increases toward 1 as the score either
+           increases toward 1 or decreases toward 0.
+         */
+        return array(
+            'title' => $title,
+            'url' => $page[self::URL],
+            'key' => webencode(Classifier::makeKey($page)),
+            'cache_link' => $cache_link,
+            'description' => $description,
+            'score' => $score,
+            'positive' => $score >= 0.5 ? 1 :0,
+            'confidence' => abs($score - 0.5) / 0.5,
+            'disagreement' => $disagreement);
+    }
+}
+?>
diff --git a/controllers/fetch_controller.php b/controllers/fetch_controller.php
index af45da888..f36f45115 100755
--- a/controllers/fetch_controller.php
+++ b/controllers/fetch_controller.php
@@ -37,6 +37,8 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 require_once BASE_DIR."/controllers/controller.php";
 /** Loads common constants for web crawling*/
 require_once BASE_DIR."/lib/crawl_constants.php";
+/** For user-trained classification of page summaries*/
+require_once BASE_DIR."/lib/classifiers/classifier.php";
 
 /** get available archive iterators */
 foreach(glob(BASE_DIR."/lib/archive_bundle_iterators/*_bundle_iterator.php")
@@ -228,12 +230,20 @@ class FetchController extends Controller implements CrawlConstants
             $arctype = $info[self::ARC_TYPE];
             $iterator_name = $arctype."Iterator";
 
-            if($info[self::ARC_DIR] == "MIX") { //recrawl of crawl mix case
-                $archive_iterator = new $iterator_name($iterate_timestamp,
-                    $result_timestamp);
-            } else { //any other archive crawl except web archive recrawls
-                $archive_iterator = new $iterator_name($iterate_timestamp,
-                    $info[self::ARC_DIR], $result_timestamp, $result_dir);
+            if(!class_exists($iterator_name)) {
+                $info['ARCHIVE_BUNDLE_ERROR'] =
+                    "Invalid bundle iterator: '{$iterator_name}'";
+            } else {
+                if($info[self::ARC_DIR] == "MIX") {
+                    //recrawl of crawl mix case
+                    $archive_iterator = new $iterator_name(
$iterate_timestamp, $result_timestamp); + } else { + //any other archive crawl except web archive recrawls + $archive_iterator = new $iterator_name( + $iterate_timestamp, $info[self::ARC_DIR], + $result_timestamp, $result_dir); + } } } $pages = false; @@ -518,6 +528,18 @@ class FetchController extends Controller implements CrawlConstants $info[$field] = $status[$field]; } } + /* + When initiating a new crawl AND there are active + classifiers (an array of class labels), then augment the + info with compressed, serialized versions of each active + classifier so that each fetcher can reconstruct the same + classifiers. + */ + if (isset($status[self::ACTIVE_CLASSIFIERS])) { + $classifiers_data = Classifier::loadClassifiersData( + $status[self::ACTIVE_CLASSIFIERS]); + $info[self::ACTIVE_CLASSIFIERS_DATA] = $classifiers_data; + } } } diff --git a/controllers/resource_controller.php b/controllers/resource_controller.php index fa79c0b67..ca3392744 100644 --- a/controllers/resource_controller.php +++ b/controllers/resource_controller.php @@ -50,7 +50,7 @@ require_once BASE_DIR."/lib/url_parser.php"; class ResourceController extends Controller implements CrawlConstants { /** - * No models used by this controller + * Models used by this controller * @var array */ var $models = array("crawl"); diff --git a/css/search.css b/css/search.css index 82f34f937..ff20d302b 100755 --- a/css/search.css +++ b/css/search.css @@ -1675,6 +1675,9 @@ ul.in-list li .mixes-table, .mixes-table td, .mixes-table th, +.classifiers-table, +.classifiers-table td, +.classifiers-table th, .search-page-table, .search-page-table td, .search-sources-table, @@ -1687,6 +1690,8 @@ ul.in-list li .html-ltr .file-types-table, .html-ltr .file-types-all td, +.html-ltr .classifiers-table, +.html-ltr .classifiers-all td, .html-ltr .search-page-table, .html-ltr .search-page-all td { @@ -1817,3 +1822,81 @@ ul.in-list li height:0.25in; width: 0.25in; } + +/* + Styles for the classifiers settings pages + */ 
+#update-accuracy.disabled +{ + color: gray; +} + +#label-docs-form th, +#label-docs-queue .actions +{ + width: 1.0in; + padding: 0.1in 0.2in 0.1in 0in; + font-weight: normal; +} + +#label-docs-queue .actions +{ + padding: 0.1in; +} + +#label-docs-queue +{ + font-size: 11pt; +} + +#label-docs-queue td +{ + vertical-align: top; + padding-top: 10px; +} + +#label-docs-queue .labelled td +{ + background-color: #f0f0f0; +} + +#label-docs-queue .notinclass td +{ + background-color: #FFE0E0; +} + +#label-docs-queue .inclass td +{ + background-color: #F0FFF0; +} + +#label-docs-queue tr.inclass a.inclass, +#label-docs-queue tr.notinclass a.notinclass, +#label-docs-queue tr.skip a.skip +{ + text-decoration: none; + color: black; + cursor: default; + font-weight: bold; +} + +#label-docs-queue p +{ + width: auto; + margin: 3px 0px; +} + +#label-docs-queue .info +{ + padding-left: 7px; +} + +#label-docs-queue .description +{ + color: #666; +} + +#label-docs-queue .description b +{ + color: black; +} diff --git a/data/default.db b/data/default.db index 7fd38ac51..7b2a308fe 100644 Binary files a/data/default.db and b/data/default.db differ diff --git a/index.php b/index.php index 71b2133d0..dadfe1715 100755 --- a/index.php +++ b/index.php @@ -127,7 +127,7 @@ if (function_exists('lcfirst') === false) { $available_controllers = array( "admin", "archive", "cache", "crawl", "fetch", "machine", "resource", "search", "settings", "statistics", - "static",); + "static", "classifier"); if(!WEB_ACCESS) { $available_controllers = array("admin", "archive", "cache", "crawl", "fetch", "machine"); diff --git a/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php b/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php index 2cb7a4a60..31737221f 100644 --- a/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php +++ b/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php @@ -72,7 +72,14 @@ class MixArchiveBundleIterator extends ArchiveBundleIterator var $mix_timestamp; 
/** - * count of how far our into the crawl mix we've gone. + * Used to hold timestamp of the index archive bundle of output results + * + * @var int + */ + var $result_timestamp; + + /** + * count of how far out into the crawl mix we've gone. * * @var int */ @@ -175,8 +182,8 @@ class MixArchiveBundleIterator extends ArchiveBundleIterator */ function nextPages($num, $no_process = false) { + $objects = array("NO_PROCESS" => false); if($this->end_of_iterator) { - $objects = array("NO_PROCESS" => false); return $objects; } $results = $this->searchController->queryRequest($this->query, @@ -189,7 +196,7 @@ class MixArchiveBundleIterator extends ArchiveBundleIterator } else if ($num_results == 0) { $this->end_of_iterator = true; } else { - $objects = array("NO_PROCESS" => $results); + $objects['NO_PROCESS'] = $results; } if(isset($results["SAVE_POINT"]) ){ $end = true; diff --git a/lib/classifiers/classifier.php b/lib/classifiers/classifier.php new file mode 100644 index 000000000..fbad8eeeb --- /dev/null +++ b/lib/classifiers/classifier.php @@ -0,0 +1,1302 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage classifier + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009 - 2013 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** Common constants for page summaries */ +require_once BASE_DIR."/lib/crawl_constants.php"; +/** Used for keeping track of the vocabulary and feature map */ +require_once BASE_DIR."/lib/classifiers/features.php"; +/** Used to restrict features to an informative subset */ +require_once BASE_DIR."/lib/classifiers/feature_selection.php"; +/** Naive Bayes text classification algorithm */ +require_once BASE_DIR."/lib/classifiers/naive_bayes.php"; +/** Logistic regression text classification algorithm */ +require_once BASE_DIR."/lib/classifiers/lasso_regression.php"; +/** Used to guess locale from a string */ +require_once BASE_DIR."/lib/locale_functions.php"; +/** Used to tokenize page summaries */ +require_once BASE_DIR."/lib/phrase_parser.php"; + +/** + * The primary interface for building and using classifiers. An instance of + * this class represents a single classifier in memory, but the class also + * provides static methods to manage classifiers on disk. + * + * A single classifier is a tool for determining the likelihood that a document + * is a positive instance of a particular class. In order to do this, a + * classifier goes through a training phase on a labeled training set where it + * learns weights for document features (terms, for our purposes). To classify + * a new document, the learned weights for all terms in the document are + * combined in order to yield a pseudo-probability that the document belongs to + * the class. + * + * A classifier is composed of a candidate buffer, a training set, a set of + * features, and a classification algorithm.
In addition to the set of all + * features, there is a restricted set of features used for training and + * classification. There are also two classification algorithms: a Naive Bayes + * algorithm used during labeling, and a logistic regression algorithm used to + * train the final classifier. In general, a fresh classifier will first go + * through a labeling phase where a collection of labeled training documents is + * built up out of existing crawl indexes, and then a finalization phase where + * the logistic regression algorithm will be trained on the training set + * established in the first phase. After finalization, the classifier may be + * used to classify new web pages during a crawl. + * + * During the labeling phase, the classifier fills a buffer of candidate pages + * from the user-selected index (optionally restricted by a query), and tries + * to pick the best one to present to the user to be labeled (here `best' means + * the one that, once labeled, is most likely to improve classification + * accuracy). Each labeled document is removed from the buffer, converted to a + * feature vector (described next), and added to the training set. The expanded + * training set is then used to train an intermediate Naive Bayes + * classification algorithm that is in turn used to more accurately identify + * good candidates for the next round of labeling. This phase continues until + * the user gets tired of labeling documents, or is happy with the estimated + * classification accuracy. + * + * Instead of passing around terms everywhere, each document that goes into the + * training set is first mapped through a Features instance that maps terms to + * feature indices (e.g. "Pythagorean" => 1, "theorem" => 2, etc.). These + * feature indices are used internally by the classification algorithms, and by + * the algorithms that try to pick out the most informative features. 
In + * addition to keeping track of the mapping between terms and feature indices, + * a Features instance keeps term and label statistics (such as how often a + * term occurs in documents with a particular label) used to weight features + * within a document and to select informative features. Finally, subclasses of + * the Features class weight features in different ways, presenting more or + * less of everything that's known about the frequency or informativeness of a + * feature to classification algorithms. + * + * Once a sufficiently-useful training set has been built, a FeatureSelection + * instance is used to choose the most informative features, and copy these + * into a reduced Features instance that has a much smaller vocabulary, and + * thus a much smaller memory footprint. For efficiency, this is the Features + * instance used to train classification algorithms, and to classify web pages. + * Finalization is just the process of training a logistic regression + * classification algorithm on the full training set. This results in a set of + * feature weights that can be used to efficiently assign a pseudo-probability + * to the proposition that a new web page is a positive instance of the class + * that the classifier has been trained to recognize. Training logistic + * regression on a large training set can take a long time, so this phase is + * carried out asynchronously, by a daemon launched in response to the + * finalization request. + * + * Because the full Features instance, buffer, and training set are only needed + * during the labeling and finalization phases, and because they can get very + * large and take up a lot of space in memory, this class separates its large + * instance members into separate files when serializing to disk. When a + * classifier is first loaded into memory from disk it brings along only its + * summary statistics, since these are all that are needed to, for example, + * display a list of classifiers.
In order to actually add new documents to the + * training set, finalize, or classify, the classifier must first be explicitly + * told to load the relevant data structures from disk; this is accomplished by + * methods like prepareToLabel and prepareToClassify. These methods load in + * the relevant serialized structures, and mark the associated data members for + * storage back to disk when (or if) the classifier is serialized again. + * + * @author Shawn Tice + * @package seek_quarry + * @subpackage classifier + */ +class Classifier implements CrawlConstants +{ + /** + * The maximum number of candidate documents to consider at once in order + * to find the best candidate. + */ + const BUFFER_SIZE = 51; + + /** + * The number of Naive Bayes instances to use to calculate disagreement + * during candidate selection. + */ + const COMMITTEE_SIZE = 3; + + /** + * The maximum disagreement score between candidates. This number depends + * on committee size, and is used to provide a slightly more user-friendly + * estimate of how much disagreement a document causes (between 0 and 1). + */ + const MAX_DISAGREEMENT = 1.63652; // Depends on committee size + + /** + * Lambda parameter used in the computation of a candidate document's + * density (smoothing for 0-frequency terms). + */ + const DENSITY_LAMBDA = 0.5; + + /** + * Beta parameter used in the computation of a candidate document's density + * (sharpness of the KL-divergence). + */ + const DENSITY_BETA = 3.0; + + /** + * Threshold used to convert a pseudo-probability to a hard classification + * decision. Documents with pseudo-probability >= THRESHOLD are classified + * as positive instances. + */ + const THRESHOLD = 0.5; + + /** + * Indicates that a classifier needs to be finalized before it can be used. + */ + const UNFINALIZED = 0; + + /** + * Indicates that a classifier is currently being finalized (this may take + * a while). 
+ */ + const FINALIZING = 1; + + /** + * Indicates that a classifier has been finalized, and is ready to be used + * for classification. + */ + const FINALIZED = 2; + + /** + * Default per-classifier options, which may be overridden when + * constructing a new classifier. The supported options are: + * + * float density.lambda: Lambda parameter used in the computation of a + * candidate document's density (smoothing for 0-frequency terms). + * + * float density.beta: Beta parameter used in the computation of a + * candidate document's density (sharpness of the KL-divergence). + * + * int label_fs.max: Use the `label_fs' most informative features to + * train the Naive Bayes classifiers used during labeling to + * compute disagreement for a document. + * + * float threshold: Threshold used to convert a pseudo-probability to a + * hard classification decision. Documents with pseudo-probability + * >= `threshold' are classified as positive instances. + * + * string final_algo: Algorithm to use for finalization; 'lr' for + * logistic regression, or 'nb' for Naive Bayes; default 'lr'. + * + * int final_fs.max: Use the `final_fs' most informative features to + * train the final classifier. + * + * @var array + */ + var $options = array( + 'density' => array( + 'lambda' => 0.5, + 'beta' => 3.0), + 'threshold' => 0.5, + 'label_fs' => array( + 'max' => 30), + 'final_fs' => array( + 'max' => 200), + 'final_algo' => 'lr'); + + /** + * The label applied to positive instances of the class learned by this + * classifier (e.g., `spam'). + * @var string + */ + var $class_label; + + /** + * Creation time as a UNIX timestamp. + * @var int + */ + var $timestamp; + + /** + * Language of documents in the training set (also how new documents will + * be treated). + * @var string + */ + var $lang; + + /** + * Whether or not this classifier has had any training examples added to + * it, and consequently whether or not its Naive Bayes classification + * algorithm has ever been trained.
+ * @var bool + */ + var $fresh = true; + + /** + * Finalization status, as determined by one of the three finalization + * constants. + * @var int + */ + var $finalized = 0; + + /** + * The number of positive examples in the training set. + * @var int + */ + var $positive = 0; + + /** + * The number of negative examples in the training set. + * @var int + */ + var $negative = 0; + + /** + * The total number of examples in the training set (sum of positive and + * negative). + * @var int + */ + var $total = 0; + + /** + * The estimated classification accuracy. This member may be null if the + * accuracy has not yet been estimated, or out of date if examples have + * been added to the training set since the last accuracy update, but no + * new estimate has been computed. + * @var float + */ + var $accuracy; + + /* + The following properties are all serialized, compressed, and stored in + individual files, then loaded on demand. + */ + + /** + * The current pool of candidates for labeling. The first element in the + * buffer is always the active document, and as active documents are + * labeled and removed, the pool is refreshed with new candidates (if there + * are more pages to be drawn from the active index). The buffer is + * represented as an associative array with three fields: 'docs', the + * candidate page summaries; 'densities', an array of densities computed + * for the documents in the candidate pool; and 'stats', statistics about + * the terms and documents in the current pool. + * @var array + */ + var $buffer; + + /** + * The training set, broken up into two fields of an associative array: + * 'features', an array of document feature vectors; and 'labels', the + * labels assigned to each document. + * @var array + */ + var $docs; + + /** + * The Features subclass instance used to manage the full set of features + * seen across all documents in the training set. 
+ * @var object + */ + var $full_features; + + /** + * The Features subclass instance used to manage the reduced set of + * features used only by Naive Bayes classification algorithms during the + * labeling phase. + * @var object + */ + var $label_features; + + /** + * The NaiveBayes classification algorithm used during training to + * tentatively classify documents presented to the user for labeling. + * @var object + */ + var $label_algorithm; + + /** + * The Features subclass instance used to map documents at classification + * time to the feature vectors expected by classification algorithms. This + * will generally be a reduced feature set, just like that used during + * labeling, but potentially larger than the set used by Naive Bayes. + * @var object + */ + var $final_features; + + /** + * The finalized classification algorithm that will be used to classify new + * web pages. Will usually be logistic regression, but may be Naive Bayes, + * if set by the options. During labeling, this field is a reference to the + * Naive Bayes classification algorithm (so that that algorithm will be + * used by the `classify' method), but it won't be saved to disk as such. + * @var object + */ + var $final_algorithm; + + /** + * The names of properties set by one of the prepareTo* methods; these + * properties will be saved back to disk during serialization, while all + * other properties not listed by the __sleep method will be discarded. + * @var array + */ + var $loaded_properties = array(); + + /* PUBLIC INTERFACE */ + + /** + * Initializes a new classifier with a class label, and options to override + * the defaults. The timestamp associated with the classifier is taken from + * the time of construction. 
+     *
+     * Option groups that are themselves arrays (for example 'density' or
+     * 'label_fs') are merged key by key with their defaults, so overriding one
+     * nested setting keeps the other defaults of that group.
+     *
+     * @param string $label class label applied to positive instances of the
+     *      class this classifier is trained to recognize
+     * @param array $options optional associative array of options that will
+     *      override the default options
+     */
+    function __construct($label, $options = array())
+    {
+        $this->class_label = $label;
+        $this->timestamp = time();
+        foreach ($options as $name => $value) {
+            $is_group = is_array($value) && isset($this->options[$name]) &&
+                is_array($this->options[$name]);
+            if ($is_group) {
+                /*
+                    A plain array_merge of the two top-level arrays would
+                    replace the whole group and silently drop the defaults
+                    the caller did not mention.
+                 */
+                $this->options[$name] = array_merge(
+                    $this->options[$name], $value);
+            } else {
+                $this->options[$name] = $value;
+            }
+        }
+    }
+
+    /**
+     * Magic method that determines which member data will be stored when
+     * serializing this class. Only lightweight summary data are stored with
+     * the serialized version of this class. The heavier-weight properties are
+     * stored in individual, compressed files.
+     *
+     * @return array names of properties to store when serializing this
+     *      instance
+     */
+    function __sleep()
+    {
+        return array(
+            'options',
+            'class_label',
+            'timestamp',
+            'lang',
+            'fresh',
+            'finalized',
+            'positive',
+            'negative',
+            'total',
+            'accuracy');
+    }
+
+    /* PREPARING FOR A TASK */
+
+    /**
+     * Prepare this classifier instance for labeling. This operation requires
+     * all of the heavyweight member data save the final features and
+     * algorithm. Note that these properties are set to references to the
+     * Naive Bayes features and algorithm, so that Naive Bayes will be used to
+     * tentatively classify documents during labeling (purely to give the user
+     * some feedback on how the training set is performing).
+     */
+    function prepareToLabel()
+    {
+        $this->loadProperties('buffer', 'docs', 'full_features',
+            'label_features', 'label_algorithm');
+        // Anything that came back empty gets a fresh, empty structure
+        if ($this->docs === NULL) {
+            $this->docs = array(
+                'features' => array(),
+                'labels' => array());
+        }
+        if ($this->full_features === NULL) {
+            $this->full_features = new BinaryFeatures();
+        }
+        if ($this->label_algorithm === NULL) {
+            $this->label_algorithm = new NaiveBayes();
+        }
+        // While labeling, classify() works off the Naive Bayes members
+        $this->final_algorithm = $this->label_algorithm;
+        $this->final_features = $this->label_features;
+    }
+
+    /**
+     * Gets the classifier ready for training the final classification
+     * algorithm. The status is switched to FINALIZING and the instance is
+     * handed to setClassifier before anything heavyweight is loaded
+     * (presumably so the new status is stored right away; confirm against
+     * setClassifier). The training set, the full features and the previous
+     * final members are then loaded, after which the final features are
+     * cleared and a new, untrained final algorithm is created: Naive Bayes
+     * when the 'final_algo' option is 'nb' (compared case-insensitively),
+     * lasso regression otherwise.
+     */
+    function prepareToFinalize()
+    {
+        $this->finalized = self::FINALIZING;
+        self::setClassifier($this);
+        $this->loadProperties('docs', 'full_features', 'final_features',
+            'final_algorithm');
+        $this->final_features = NULL;
+        if (strcasecmp($this->options['final_algo'], 'nb') == 0) {
+            $this->final_algorithm = new NaiveBayes();
+        } else {
+            $this->final_algorithm = new LassoRegression();
+        }
+    }
+
+    /**
+     * Gets the classifier ready for classifying pages by loading the final
+     * features and the final classification algorithm, the only members
+     * that classification needs.
+     */
+    function prepareToClassify()
+    {
+        $this->loadProperties('final_features', 'final_algorithm');
+    }
+
+    /* LABELING PHASE */
+
+    /**
+     * Updates the buffer and training set to reflect the label given to a new
+     * document. The label may be -1, 1, or 0, where the first two correspond
+     * to a negative or positive example, and the last to a skip.
The handling + * for a skip is necessarily different from that for a positive or negative + * label, and matters are further complicated by the possibility that we + * may be changing a label for a document that's already in the training + * set, rather than adding a new document. This function returns true if + * the new label resulted in a change to the training set, and false + * otherwise (i.e., if the user simply skipped labeling the candidate + * document). + * + * When updating an existing document, we will either need to swap the + * label in the training set and update the statistics stored by the + * Features instance (since now the features are associated with a + * different label), or drop the document from the training set and (again) + * update the statistics stored by the Features instance. In either case + * the negative and positive counts must be updated as well. + * + * When working with a new document, we need to remove it from the + * candidate buffer, and if the label is non-zero then we also need to add + * the document to the training set. That involves tokenizing the document, + * passing the tokens through the full_features instance, and storing the + * resulting feature vector, plus the new label in the docs attribute. The + * positive and negative counts must be updated as well. + * + * Finally, if this operation is occurring during active labeling (when the user + * is providing labels one at a time), that information needs to be passed + * along to dropBufferDoc, which can avoid doing some work in the + * non-active case.
+     *
+     * Re-applying the label that an already-labeled document currently has
+     * leaves the training set and all counts untouched, and false is
+     * returned.
+     *
+     * @param string $key key used to select the document from the docs array
+     * @param int $label new label (-1, 1, or 0)
+     * @param bool $is_active whether this operation is being carried out
+     *      during active labeling
+     * @return bool true if the training set was modified, and false otherwise
+     */
+    function labelDocument($key, $label, $is_active = true)
+    {
+        $labels_changed = true;
+        if (isset($this->docs['labels'][$key])) {
+            $prev_label = $this->docs['labels'][$key];
+            if ($label == $prev_label) {
+                /*
+                    Same label again: swapping the counters here would move
+                    one example from one class to the other even though
+                    nothing changed, so do nothing at all.
+                 */
+                $labels_changed = false;
+            } else if ($label != 0) {
+                // Flip the label of a document already in the training set
+                $this->full_features->updateExampleLabel(
+                    $this->docs['features'][$key], $prev_label, $label);
+                $this->docs['labels'][$key] = $label;
+                if ($label > 0) {
+                    $this->positive++;
+                    $this->negative--;
+                } else {
+                    $this->negative++;
+                    $this->positive--;
+                }
+            } else {
+                // Label 0 on a labeled document removes it from the set
+                $this->full_features->updateExampleLabel(
+                    $this->docs['features'][$key], $prev_label, 0);
+                unset($this->docs['features'][$key]);
+                unset($this->docs['labels'][$key]);
+                if ($prev_label > 0) {
+                    $this->positive--;
+                } else {
+                    $this->negative--;
+                }
+            }
+        } else if ($label == 0) {
+            // Skipping a candidate only removes it from the buffer
+            $labels_changed = false;
+            $this->dropBufferDoc($is_active);
+        } else {
+            if ($label > 0) {
+                $this->positive++;
+            } else {
+                $this->negative++;
+            }
+            // The candidate being labeled is the first buffer document
+            $doc = $this->buffer['docs'][0];
+            $features = $this->full_features->addExample(
+                $doc['TERMS'], $label);
+            $this->docs['features'][$key] = $features;
+            $this->docs['labels'][$key] = $label;
+            $this->dropBufferDoc($is_active);
+        }
+        $this->total = $this->negative + $this->positive;
+        $this->fresh = false;
+        if ($labels_changed) {
+            // A modified training set invalidates any finalized algorithm
+            $this->finalized = self::UNFINALIZED;
+        }
+        return $labels_changed;
+    }
+
+    /**
+     * Iterates entirely through a crawl mix iterator, adding each document
+     * (that hasn't already been labeled) to the training set with a single
+     * label.
This function works by running through the iterator, filling up
+     * the candidate buffer with all unlabeled documents, then repeatedly
+     * dropping the first buffer document and adding it to the training set.
+     * Returns the total number of newly-labeled documents.
+     *
+     * @param object $mix_iterator crawl mix iterator to draw documents from
+     * @param int $label label to apply to every document; -1 or 1, but NOT 0
+     * @param int $limit optional upper bound on the number of documents to
+     *      add; defaults to no limit
+     * @return int total number of newly-labeled documents
+     */
+    function addAllDocuments($mix_iterator, $label, $limit = INF)
+    {
+        // A buffer size of 0 just resets the buffer, so $count starts at 0
+        $count = $this->initBuffer($mix_iterator, 0);
+        while (!$mix_iterator->end_of_iterator && $count < $limit) {
+            $new_pages = $mix_iterator->nextPages(500);
+            if (isset($new_pages['NO_PROCESS'])) {
+                unset($new_pages['NO_PROCESS']);
+            }
+            $num_pages = 0;
+            /*
+                foreach with an explicit break replaces the former
+                list()/each() loop; each() was deprecated in PHP 7.2 and
+                removed in PHP 8.0.
+             */
+            foreach ($new_pages as $page) {
+                if ($count + $num_pages >= $limit) {
+                    break;
+                }
+                $key = self::makeKey($page);
+                if (!isset($this->docs['labels'][$key])) {
+                    $this->addBufferDoc($page, false);
+                    $num_pages++;
+                }
+            }
+            // Label the batch by repeatedly taking the first buffer document
+            for ($i = $num_pages; $i > 0; $i--) {
+                $key = self::makeKey($this->buffer['docs'][0]);
+                $this->labelDocument($key, $label, false);
+            }
+            $count += $num_pages;
+        }
+        return $count;
+    }
+
+    /**
+     * Drops any existing candidate buffer, re-initializes the buffer
+     * structure, then calls refreshBuffer to fill it. Takes an optional buffer
+     * size, which can be used to limit the buffer to something other than the
+     * number imposed by the runtime parameter. Returns the final buffer size.
+     *
+     * @param object $mix_iterator crawl mix iterator to draw documents from
+     * @param int $buffer_size optional buffer size to use; defaults to the
+     *      runtime parameter
+     * @return int final buffer size
+     */
+    function initBuffer($mix_iterator, $buffer_size = NULL)
+    {
+        $empty_stats = array(
+            'terms' => array(),
+            'num_tokens' => 0,
+            'docs' => array(),
+            'num_docs' => 0
+        );
+        $this->buffer = array(
+            'docs' => array(),
+            'densities' => array(),
+            'stats' => $empty_stats
+        );
+        return $this->refreshBuffer($mix_iterator, $buffer_size);
+    }
+
+    /**
+     * Tops the candidate buffer up to the requested size (BUFFER_SIZE when
+     * no size is given) with pages drawn from the iterator. Pages whose key
+     * already has a label in the training set are passed over. Fetching
+     * stops as soon as the buffer is full or the iterator is exhausted, so
+     * the returned size can be smaller than the one requested.
+     *
+     * @param object $mix_iterator crawl mix iterator to draw documents from
+     * @param int $buffer_size optional buffer size to use; defaults to
+     *      BUFFER_SIZE
+     * @return int number of documents in the buffer afterwards
+     */
+    function refreshBuffer($mix_iterator, $buffer_size = NULL)
+    {
+        $target = ($buffer_size === NULL) ? self::BUFFER_SIZE : $buffer_size;
+        $filled = count($this->buffer['docs']);
+        while ($filled < $target && !$mix_iterator->end_of_iterator) {
+            // Only ask for as many pages as are still missing
+            $candidates = $mix_iterator->nextPages($target - $filled);
+            if (isset($candidates['NO_PROCESS'])) {
+                unset($candidates['NO_PROCESS']);
+            }
+            foreach ($candidates as $candidate) {
+                $candidate_key = self::makeKey($candidate);
+                if (isset($this->docs['labels'][$candidate_key])) {
+                    continue;
+                }
+                $this->addBufferDoc($candidate);
+                $filled++;
+            }
+        }
+        return $filled;
+    }
+
+    /**
+     * Computes from scratch the buffer densities of the documents in the
+     * current candidate pool. This is an expensive operation that requires
+     * the computation of the KL-divergence between each ordered pair of
+     * documents in the pool, approximately O(N^2) computations, total (where N
+     * is the number of documents in the pool).
The densities are saved in the
+     * buffer data structure.
+     *
+     * The density of a document is approximated by its average overlap with
+     * every other document in the candidate buffer, where the overlap between
+     * two documents is itself approximated using the exponential, negative
+     * KL-divergence between them. The KL-divergence is smoothed to deal with
+     * features (terms) that occur in one distribution (document) but not the
+     * other, and then multiplied by a negative constant and exponentiated in
+     * order to convert it to a kind of linear overlap score.
+     */
+    function computeBufferDensities()
+    {
+        $this->buffer['densities'] = array();
+        $densities =& $this->buffer['densities'];
+        $stats =& $this->buffer['stats'];
+        // Totals do not change while the densities are being computed
+        $num_docs = $stats['num_docs'];
+        $num_tokens = $stats['num_tokens'];
+        foreach ($stats['docs'] as $i => $doc_i) {
+            $sum_i = 0.0;
+            foreach ($stats['docs'] as $h => $doc_h) {
+                if ($h == $i) {
+                    continue;
+                }
+                // Smoothed KL-divergence of document h from document i
+                $sum_ih = 0.0;
+                foreach ($doc_h as $t => $p) {
+                    /*
+                        Mix the weight of the term in document i with the
+                        term's share of all tokens in the pool, so q stays
+                        positive for terms that document i lacks.
+                     */
+                    $q = self::DENSITY_LAMBDA *
+                        (isset($doc_i[$t]) ? $doc_i[$t] : 0.0) +
+                        (1.0 - self::DENSITY_LAMBDA) *
+                        $stats['terms'][$t] / $num_tokens;
+                    $sum_ih += $p * log($p / $q);
+                }
+                $sum_i += -self::DENSITY_BETA * $sum_ih;
+            }
+            $densities[] = exp($sum_i / $num_docs);
+        }
+    }
+
+    /**
+     * Finds the next best document for labeling amongst the documents in the
+     * candidate buffer, moves that candidate to the front of the buffer, and
+     * returns it. The best candidate is the one with the maximum product of
+     * disagreement and density, where the density has already been calculated
+     * for each document in the current pool, and the disagreement is the
+     * KL-divergence between the classification scores obtained from a
+     * committee of Naive Bayes classifiers, each sampled from the current
+     * set of features.
+     *
+     * @return array two-element array containing first the best candidate, and
+     *      second the disagreement score, obtained by dividing the disagreement
+     *      for the document by the maximum disagreement possible for the
+     *      committee size
+     */
+    function findNextDocumentToLabel()
+    {
+        if (empty($this->buffer['docs'])) {
+            return array(NULL, 0.0);
+        }
+        if ($this->fresh) {
+            // Nothing has been learned yet, so take the first candidate
+            return array($this->buffer['docs'][0], 0.0);
+        }
+        // One list of committee scores per buffer position
+        $votes = array_fill(0, count($this->buffer['docs']), array());
+        $member = 0;
+        while ($member < self::COMMITTEE_SIZE) {
+            $model = new NaiveBayes();
+            $model->sampleBeta($this->label_features);
+            foreach ($this->buffer['docs'] as $pos => $page) {
+                $vector = $this->label_features->mapDocument($page['TERMS']);
+                $votes[$pos][$member] = $model->classify($vector);
+            }
+            $member++;
+        }
+        $densities =& $this->buffer['densities'];
+        $best_pos = 0;
+        $best_score = -INF;
+        $best_kld = -INF;
+        foreach ($votes as $pos => $scores) {
+            $kld = 1.0 + self::klDivergenceToMean($scores);
+            $score = $kld * $densities[$pos];
+            if ($score > $best_score) {
+                $best_score = $score;
+                $best_kld = $kld;
+                $best_pos = $pos;
+            }
+        }
+        $chosen = $this->buffer['docs'][$best_pos];
+        $this->moveBufferDocToFront($best_pos);
+        return array($chosen, $best_kld / self::MAX_DISAGREEMENT);
+    }
+
+    /**
+     * Trains the Naive Bayes classification algorithm used during labeling on
+     * the current training set, and optionally updates the estimated accuracy.
+     *
+     * @param bool $update_accuracy optional parameter specifying whether or
+     *      not to update the accuracy estimate after training completes;
+     *      defaults to false
+     */
+    function train($update_accuracy = false)
+    {
+        $selector = new ChiSquaredFeatureSelection(
+            $this->options['label_fs']);
+        $this->label_features = $this->full_features->restrict($selector);
+        // classify() reads final_features, so keep it on the same object
+        $this->final_features = $this->label_features;
+        $examples = $this->label_features->mapTrainingSet(
+            $this->docs['features']);
+        $labels = array_values($this->docs['labels']);
+        $this->label_algorithm->train($examples, $labels);
+        if ($update_accuracy) {
+            $this->updateAccuracy($examples, $labels);
+        }
+    }
+
+    /**
+     * Estimates the classification accuracy with five rounds of Naive Bayes
+     * training and testing on a random sample of the training set, and
+     * stores the average in the accuracy property. The sample size is the
+     * number of examples rounded down to a multiple of 10 and capped at
+     * 250; with fewer than 10 examples the method returns without touching
+     * the current estimate. In each round four fifths of the sample train
+     * a fresh NaiveBayes instance and the remaining fifth tests it; between
+     * rounds the testing fifth is rotated to the front of the sample so
+     * that every fifth is used for testing once.
+     *
+     * @param object $X optional sparse matrix representing the already-mapped
+     *      training set to use; if not provided, the current training set is
+     *      mapped using the label_features property
+     * @param array $y optional array of document labels corresponding to the
+     *      training set; if not provided the current training set labels are
+     *      used
+     */
+    function updateAccuracy($X = NULL, $y = NULL)
+    {
+        if ($X === NULL) {
+            $X = $this->label_features->mapTrainingSet(
+                $this->docs['features']);
+        }
+        // Sample size: multiple of 10, at most 250 examples
+        $m = min(250, intval(floor($X->rows() / 10)) * 10);
+        if ($m < 10) {
+            return;
+        }
+        if ($y === NULL) {
+            $y = array_values($this->docs['labels']);
+        }
+        $sample = array_rand($y, $m);
+        shuffle($sample);
+        $fold_size = $m / 5;
+        $divide = 4 * $fold_size;
+        $accuracy_sum = 0.0;
+        for ($round = 0; $round < 5; $round++) {
+            if ($round > 0) {
+                // Rotate the previous testing fifth to the front
+                $sample = array_merge(
+                    array_slice($sample, $divide),
+                    array_slice($sample, 0, $divide));
+            }
+            $train_rows = array_slice($sample, 0, $divide);
+            sort($train_rows);
+            $test_rows = array_slice($sample, $divide);
+            sort($test_rows);
+            list($train_X, $test_X) = $X->partition($train_rows, $test_rows);
+            $train_y = array();
+            foreach ($train_rows as $row) {
+                $train_y[] = $y[$row];
+            }
+            $test_y = array();
+            foreach ($test_rows as $row) {
+                $test_y[] = $y[$row];
+            }
+            $model = new NaiveBayes();
+            $model->train($train_X, $train_y);
+            $hits = 0;
+            foreach ($test_X as $pos => $example) {
+                $predicted = ($model->classify($example) >= 0.5) ? 1 : -1;
+                if ($predicted == $test_y[$pos]) {
+                    $hits++;
+                }
+            }
+            $accuracy_sum += $hits / count($test_y);
+        }
+        $this->accuracy = $accuracy_sum / 5;
+    }
+
+    /* FINALIZATION PHASE */
+
+    /**
+     * Trains the final classification algorithm on the complete training
+     * set. The features are first restricted with a chi-squared feature
+     * selection configured by the 'final_fs' option, the training set is
+     * mapped onto the restricted features, and the algorithm held in
+     * final_algorithm is trained on the result. Afterwards the classifier
+     * is marked as FINALIZED.
+     */
+    function finalize()
+    {
+        $selector = new ChiSquaredFeatureSelection(
+            $this->options['final_fs']);
+        $this->final_features = $this->full_features->restrict($selector);
+        $examples = $this->final_features->mapTrainingSet(
+            $this->docs['features']);
+        $labels = array_values($this->docs['labels']);
+        $this->final_algorithm->train($examples, $labels);
+        $this->finalized = self::FINALIZED;
+    }
+
+    /* CLASSIFICATION PHASE */
+
+    /**
+     * Classifies a page summary using the current final classification
+     * algorithm and features, and returns the classification score.
/* CLASSIFICATION PHASE */

/**
 * Scores a page summary with the current final classification algorithm and
 * final feature set.
 *
 * The page description is tokenized, mapped into the final feature space
 * and handed to final_algorithm. A fresh classifier has no support for
 * either label, so it reports a score just beneath self::THRESHOLD (never
 * below zero) without looking at the page.
 *
 * @param array $page page summary array for the page to be classified
 * @return float pseudo-probability that the page is a positive instance of
 *      the target class
 */
function classify($page)
{
    if (!$this->fresh) {
        $terms = $this->tokenizeDescription($page[self::DESCRIPTION]);
        $vector = $this->final_features->mapDocument($terms);
        return $this->final_algorithm->classify($vector);
    }
    // Untrained: stay neutral, but just under the threshold.
    $just_below = self::THRESHOLD - 1.0E-8;
    return max($just_below, 0.0);
}
/* PRIVATE INTERFACE */

/**
 * Adds a page to the end of the candidate buffer, keeping the associated
 * statistics up to date. The page description is always tokenized and
 * stored under the page's 'TERMS' key. During active training the buffer
 * statistics are updated as well: per-term totals, the total token count,
 * a per-document map of terms to relative frequency (count divided by the
 * document length), and the document count.
 *
 * @param array $page page summary for the document to add to the buffer
 * @param bool $is_active whether this operation is part of active
 *      training, in which case some extra statistics must be maintained
 */
function addBufferDoc($page, $is_active = true)
{
    $page['TERMS'] = $this->tokenizeDescription($page[self::DESCRIPTION]);
    $this->buffer['docs'][] = $page;
    if (!$is_active) {
        return;
    }
    $doc_length = array_sum($page['TERMS']);
    $doc = array();
    foreach ($page['TERMS'] as $term => $count) {
        if (!isset($this->buffer['stats']['terms'][$term])) {
            $this->buffer['stats']['terms'][$term] = $count;
        } else {
            $this->buffer['stats']['terms'][$term] += $count;
        }
        $this->buffer['stats']['num_tokens'] += $count;
        /*
            Normalize while building the map rather than in a second
            by-reference loop, which left a dangling reference inside the
            array that was stored in the buffer.
         */
        $doc[$term] = $count / $doc_length;
    }
    $this->buffer['stats']['docs'][] = $doc;
    $this->buffer['stats']['num_docs']++;
}

/**
 * Removes the document at the front of the candidate buffer. During active
 * training the per-term totals, token count, per-document statistics and
 * document count are reduced to match. Does nothing when the buffer is
 * empty.
 *
 * @param bool $is_active whether this operation is part of active
 *      training, in which case some extra statistics must be maintained
 */
function dropBufferDoc($is_active = true)
{
    if (empty($this->buffer['docs'])) {
        // Nothing to drop; avoid corrupting the statistics.
        return;
    }
    $page = array_shift($this->buffer['docs']);
    if (!$is_active) {
        return;
    }
    foreach ($page['TERMS'] as $term => $count) {
        $this->buffer['stats']['terms'][$term] -= $count;
        $this->buffer['stats']['num_tokens'] -= $count;
    }
    array_shift($this->buffer['stats']['docs']);
    $this->buffer['stats']['num_docs']--;
}
/**
 * Moves a document in the candidate buffer up to the front, in preparation
 * for a label request. The document is specified by its index in the
 * buffer; the matching entry of the per-document statistics, when there is
 * one, is moved as well. An index that is not in the buffer is ignored.
 *
 * @param int $i document index within the candidate buffer
 */
function moveBufferDocToFront($i)
{
    if (!isset($this->buffer['docs'][$i])) {
        // Splicing a missing index would push NULL onto the buffer.
        return;
    }
    list($doc) = array_splice($this->buffer['docs'], $i, 1);
    array_unshift($this->buffer['docs'], $doc);
    if (isset($this->buffer['stats']['docs'][$i])) {
        list($doc_stats) = array_splice(
            $this->buffer['stats']['docs'], $i, 1);
        array_unshift($this->buffer['stats']['docs'], $doc_stats);
    }
}

/**
 * Tokenizes a string into a map from terms to within-string frequencies.
 *
 * The string is split on runs of whitespace. This deliberately simple
 * strategy is used because PhraseParser::extractPhrasesInLists was found
 * to be very slow. Every non-empty token is counted, including the token
 * "0", which a plain truthiness test would discard.
 *
 * @param string $description string to tokenize
 * @return array associative array mapping terms to their within-string
 *      frequencies, in order of first occurrence
 */
function tokenizeDescription($description)
{
    $tokens = preg_split('/\s+/', (string)$description, -1,
        PREG_SPLIT_NO_EMPTY);
    if ($tokens === false) {
        return array();
    }
    return array_count_values($tokens);
}
/**
 * Loads class attributes from compressed, serialized files on disk, and
 * stores their names so that they will be saved back to disk later. Each
 * property is stored in its own file under the classifier's data directory,
 * named after the property. The file contents are produced by gzcompress,
 * i.e. they are zlib-compressed data rather than a gzip file, so the
 * standard gzip utility cannot decompress them. Every requested property
 * is first reset to NULL; it stays NULL when its file does not exist or
 * cannot be read, decompressed, or unserialized. The property names are
 * passed as a variable number of arguments.
 *
 * @param string $property_name,... variably-sized list of property names
 *      to try to load data for
 */
function loadProperties(/* args... */)
{
    $properties = func_get_args();
    foreach ($properties as $property_name) {
        $this->$property_name = NULL;
        $filename = WORK_DIRECTORY."/classifiers/".$this->class_label.
            "/".$property_name.".txt";
        if (!file_exists($filename)) {
            continue;
        }
        $compressed_data = file_get_contents($filename);
        if ($compressed_data === false) {
            continue;
        }
        $serialized_data = gzuncompress($compressed_data);
        if ($serialized_data === false) {
            continue;
        }
        $data = unserialize($serialized_data);
        // unserialize returns false on failure and for a stored false.
        if ($data === false && $serialized_data !== serialize(false)) {
            continue;
        }
        $this->$property_name = $data;
    }
    $this->loaded_properties = $properties;
}

/**
 * Stores the data associated with each property name listed in the
 * loaded_properties instance attribute back to disk. The data for each
 * property is stored in its own serialized and compressed file, and made
 * world-writable. Does nothing if no properties have been loaded.
 */
function storeLoadedProperties()
{
    if (empty($this->loaded_properties)) {
        return;
    }
    foreach ($this->loaded_properties as $property_name) {
        $filename = WORK_DIRECTORY."/classifiers/".$this->class_label.
            "/".$property_name.".txt";
        $serialized_data = serialize($this->$property_name);
        $written = file_put_contents($filename,
            gzcompress($serialized_data));
        if ($written !== false) {
            // Only adjust permissions on a file that was actually written.
            chmod($filename, 0777);
        }
    }
}
/* PUBLIC STATIC INTERFACE */

/**
 * Given a page summary (passed by reference) and a list of classifiers,
 * augments the summary meta words with the class label of each classifier
 * that scores the summary at or above self::THRESHOLD. In addition to the
 * class label, the pseudo-probability that the document belongs to the
 * class is recorded, both as the score (as a percentage) rounded down to
 * the nearest multiple of ten, and as "<n>plus" for each multiple of ten,
 * n, from that rounded score down to the threshold.
 *
 * As an example, suppose that a classifier with class label `label' has
 * determined that a document is a positive example with pseudo-probability
 * 0.87 and threshold 0.5. The following meta words are added to the
 * summary: class:label, class:label:80, class:label:80plus,
 * class:label:70plus, class:label:60plus, and class:label:50plus.
 *
 * @param array $summary page summary to classify, passed by reference
 * @param array $classifiers list of Classifier instances, each prepared
 *      for classifying
 */
static function labelPage(&$summary, $classifiers)
{
    foreach ($classifiers as $classifier) {
        $score = $classifier->classify($summary);
        if (!($score >= self::THRESHOLD)) {
            continue;
        }
        if (!isset($summary[self::META_WORDS])) {
            $summary[self::META_WORDS] = array();
        }
        // Percentage score rounded down to a multiple of ten, e.g. 87 -> 80.
        $score = intval(floor(($score * 100) / 10) * 10);
        /*
            The rounded percentage is itself the documented label score;
            the earlier floor($score / 10) * 1000 produced 8000 for the
            example above rather than 80.
         */
        $label_score = sprintf("%d", $score);
        $label = $classifier->class_label;
        $summary[self::META_WORDS][] = "class:{$label}";
        $summary[self::META_WORDS][] = "class:{$label}:{$label_score}";
        $min_score = intval(self::THRESHOLD * 100);
        for ($s = $score; $s >= $min_score; $s -= 10) {
            $summary[self::META_WORDS][] = "class:{$label}:{$s}plus";
        }
    }
}
/**
 * Returns an array of classifier instances currently stored in the
 * classifiers directory. The array maps class labels to the instances
 * unserialized from each sub-directory's `classifier.txt' file.
 * Sub-directories without a readable, valid `classifier.txt' are skipped.
 *
 * @return array associative array of class labels mapped to their
 *      corresponding classifier instances
 */
static function getClassifierList()
{
    $classifiers = array();
    $dirname = WORK_DIRECTORY."/classifiers";
    $classifier_dirs = glob($dirname."/*");
    if (!is_array($classifier_dirs)) {
        // glob returns false on error.
        return $classifiers;
    }
    foreach ($classifier_dirs as $classifier_dir) {
        $classifier_file = $classifier_dir."/classifier.txt";
        if (!is_file($classifier_file)) {
            continue;
        }
        $serialized_data = file_get_contents($classifier_file);
        if ($serialized_data === false) {
            continue;
        }
        $classifier = unserialize($serialized_data);
        if (!is_object($classifier) || !isset($classifier->class_label)) {
            continue;
        }
        $classifiers[$classifier->class_label] = $classifier;
    }
    return $classifiers;
}

/**
 * Returns the classifier instance stored in `classifier.txt' for a class
 * label, or NULL if no such classifier exists on disk or its file cannot
 * be read back into an object.
 *
 * @param string $label classifier's class label
 * @return object classifier instance with the relevant class label, or
 *      NULL if no such classifier could be loaded
 */
static function getClassifier($label)
{
    $filename = WORK_DIRECTORY."/classifiers/{$label}/classifier.txt";
    if (!file_exists($filename)) {
        return NULL;
    }
    $serialized_data = file_get_contents($filename);
    if ($serialized_data === false) {
        return NULL;
    }
    $classifier = unserialize($serialized_data);
    return is_object($classifier) ? $classifier : NULL;
}
/**
 * Given a list of class labels, returns an array mapping each class label
 * to the data needed to initialize a classifier for that label: the
 * web-encoded contents of the `classifier', `final_features' and
 * `final_algorithm' files in the label's directory. If any of the three
 * files is missing, the label is mapped to false instead.
 *
 * @param array $labels flat array of class labels for which to load data
 * @return array associative array mapping class labels to arrays of data
 *      necessary for initializing the associated classifier, or to false
 *      when a required file is missing
 */
static function loadClassifiersData($labels)
{
    $required = array('classifier', 'final_features', 'final_algorithm');
    $result = array();
    foreach ($labels as $label) {
        $basedir = WORK_DIRECTORY."/classifiers/{$label}";
        $encoded = array();
        foreach ($required as $field) {
            $filename = "{$basedir}/{$field}.txt";
            if (!file_exists($filename)) {
                $encoded = false;
                break;
            }
            /*
                Web-encoded because the data is sent in an HTTP response to
                each fetcher as it prepares for a new crawl.
             */
            $contents = file_get_contents($filename);
            $encoded[$field] = webencode($contents);
        }
        $result[$label] = $encoded;
    }
    return $result;
}
/**
 * The dual of loadClassifiersData, this static method reconstitutes a
 * Classifier instance from an array containing the necessary data. The
 * 'classifier' entry is web-decoded and unserialized into the instance;
 * every other entry is web-decoded, decompressed, unserialized and assigned
 * to the property of the same name. The names of those other entries
 * become the instance's loaded_properties.
 *
 * NOTE(review): the data is passed to unserialize(), so it must come from a
 * trusted source.
 *
 * @param array $data associative array mapping property names to their
 *      serialized and compressed data
 * @return object Classifier instance built from the passed-in data, or NULL
 *      if there is no 'classifier' entry or it does not decode to an object
 */
static function newClassifierFromData($data)
{
    if (!isset($data['classifier'])) {
        return NULL;
    }
    $classifier = unserialize(webdecode($data['classifier']));
    if (!is_object($classifier)) {
        // Assigning properties on a failed unserialize would be an error.
        return NULL;
    }
    unset($data['classifier']);
    foreach ($data as $field => $field_data) {
        $serialized_data = gzuncompress(webdecode($field_data));
        $classifier->$field = ($serialized_data === false) ?
            NULL : unserialize($serialized_data);
    }
    $classifier->loaded_properties = array_keys($data);
    return $classifier;
}

/**
 * Stores a classifier instance to disk. The properties listed in the
 * instance's loaded_properties are written to their own files by
 * storeLoadedProperties, and the serialized instance itself is written
 * uncompressed to a file called `classifier.txt'. The classifier directory
 * (created if necessary, along with any missing parents) and
 * `classifier.txt' are made world-writable so that they can be manipulated
 * without hassle from the command line.
 *
 * @param object $classifier Classifier instance to store to disk
 */
static function setClassifier($classifier)
{
    $dirname = WORK_DIRECTORY."/classifiers/".$classifier->class_label;
    if (!file_exists($dirname)) {
        // Recursive, so a missing `classifiers' parent is created too.
        mkdir($dirname, 0777, true);
        chmod($dirname, 0777);
    }
    $classifier->storeLoadedProperties();
    $filename = $dirname."/classifier.txt";
    $serialized_data = serialize($classifier);
    file_put_contents($filename, $serialized_data);
    chmod($filename, 0777);
}
/**
 * Deletes the directory corresponding to a class label, and all of its
 * contents. In the case that there is no classifier with the passed in
 * label, does nothing. Labels that are empty, that are `.' or `..', or
 * that contain a directory separator are ignored, so that the call can
 * never remove the whole classifiers directory or a path outside of it.
 *
 * @param string $label class label of the classifier to be deleted
 */
static function deleteClassifier($label)
{
    $label = (string)$label;
    if ($label === '' || $label === '.' || $label === '..' ||
        strpbrk($label, "/\\") !== false) {
        return;
    }
    $dirname = WORK_DIRECTORY."/classifiers/{$label}";
    if (file_exists($dirname)) {
        $db_class = ucfirst(DBMS)."Manager";
        $db = new $db_class();
        $db->unlinkRecursive($dirname);
    }
}

/**
 * Removes all but alphanumeric characters and underscores from a label, so
 * that it may be easily saved to disk and used in queries as a meta word.
 *
 * @param string $label class label to clean
 * @return string the label with every character outside a-z, A-Z, 0-9 and
 *      underscore removed
 */
static function cleanLabel($label)
{
    return preg_replace('/[^a-zA-Z0-9_]/', '', $label);
}

/**
 * Returns a name for the crawl mix associated with a class label.
 *
 * @param string $label class label associated with the crawl mix
 * @return string name that can be used for the crawl mix associated with
 *      $label
 */
static function getCrawlMixName($label)
{
    return 'CLASSIFY_'.$label;
}

/**
 * Returns a key that can be used internally to refer to a particular page
 * summary: the MD5 hash of the summary's URL field.
 *
 * @param array $page page summary to return a key for
 * @return string key identifying the page summary by its URL
 */
static function makeKey($page)
{
    return md5($page[self::URL]);
}
/* PRIVATE STATIC INTERFACE */

/**
 * Calculates the KL-divergence to the mean for a collection of discrete
 * two-element probability distributions. Each distribution is specified by
 * a single probability, p, since the second probability is just 1 - p. The
 * result is the average, over the collection, of the KL-divergence from
 * each distribution to the mean distribution. Probabilities (and the mean)
 * are clamped to [1.0E-8, 1 - 1.0E-8] so that the logarithms stay finite.
 * An empty collection has no disagreement and yields 0.0.
 *
 * @param array $ps probabilities describing several discrete two-element
 *      probability distributions
 * @return float KL-divergence to the mean for the collection of
 *      distributions
 */
static function klDivergenceToMean($ps)
{
    $k = count($ps);
    if ($k == 0) {
        // Avoid dividing by zero below.
        return 0.0;
    }
    $epsilon = 1.0E-8;
    $mean = array_sum($ps) / $k;
    $mean = max(min($mean, 1.0 - $epsilon), $epsilon);
    $kld = 0.0;
    foreach ($ps as $p) {
        $p = max(min($p, 1.0 - $epsilon), $epsilon);
        $kld += $p * log($p / $mean);
        $kld += (1 - $p) * log((1 - $p) / (1 - $mean));
    }
    return $kld / $k;
}
/**
 * Abstract base class for classification algorithms that share a common
 * interface.
 *
 * It supplies a small logging helper that every classification algorithm
 * can use.
 *
 * @author Shawn Tice
 * @package seek_quarry
 * @subpackage classifier
 */
abstract class ClassifierAlgorithm
{
    // TODO: Add in automatic parameter setting, and better logging facilities,
    // similar to those used by classifier_tool.
    /**
     * Debug level; messages are only logged while this is greater than zero.
     * @var int
     */
    public $debug = 0;

    /**
     * Passes a message on to crawlLog, but only when debugging is enabled.
     *
     * @param string $message message to log
     */
    public function log($message)
    {
        $enabled = $this->debug > 0;
        if (!$enabled) {
            return;
        }
        crawlLog($message);
    }
}
/**
 * This is an abstract class that specifies an interface for selecting top
 * features from a dataset.
 *
 * Each FeatureSelection class implements a select method that takes a Features
 * instance and returns a mapping from a subset of the old feature indices to
 * new ones.
 *
 * @author Shawn Tice
 * @package seek_quarry
 * @subpackage classifier
 */
abstract class FeatureSelection
{
    /**
     * Sets any passed runtime parameters as properties of the instance.
     *
     * @param array $parameters optional associative array of parameters to
     *      replace the default ones with
     */
    function __construct($parameters = array())
    {
        foreach ($parameters as $parameter => $value) {
            $this->$parameter = $value;
        }
    }

    /**
     * Constructs a map from old feature indices to new ones from a heap of
     * selected features. Feature index 0, which is used as an intercept
     * term, is always kept and mapped to 0. The remaining features receive
     * the new indices 1, 2, ... in order of decreasing informativeness, so
     * that iterating over the map yields the most informative features
     * first. The heap is emptied in the process.
     *
     * @param object $selected heap (of either orientation) whose entries
     *      are two-element arrays holding an informativeness score and a
     *      feature index
     * @return array associative array mapping a subset of the original
     *      feature indices to the new indices
     */
    function buildMap($selected)
    {
        $entries = array();
        while (!$selected->isEmpty()) {
            $entries[] = $selected->extract();
        }
        /*
            Order explicitly instead of relying on the extraction order: the
            chi-squared selection uses a min-heap, which yields the least
            informative entry first. Entries compare by score, then index.
         */
        rsort($entries);
        $keep_features = array(0 => 0);
        $i = 1;
        foreach ($entries as $entry) {
            list($score, $j) = $entry;
            $keep_features[$j] = $i++;
        }
        return $keep_features;
    }

    /**
     * Computes the top features of a Features instance, and returns a mapping
     * from a subset of those features to new contiguous indices. The mapping
     * allows documents that have already been mapped into the larger feature
     * space to be converted to the smaller feature space, while keeping the
     * feature indices contiguous (e.g., 1, 2, 3, 4, ... instead of 22, 35,
     * 75, ...).
     *
     * @param object $features Features instance
     * @return array associative array mapping a subset of the original
     *      feature indices to new indices
     */
    abstract function select(Features $features);
}


/**
 * A subclass of FeatureSelection that implements chi-squared feature
 * selection.
 *
 * This feature selection method scores each feature according to its
 * informativeness, then selects the top N most informative features, where N
 * is a run-time parameter.
 *
 * @author Shawn Tice
 * @package seek_quarry
 * @subpackage classifier
 */
class ChiSquaredFeatureSelection extends FeatureSelection
{
    /**
     * The maximum number of features to select, a runtime parameter. When
     * unset, every feature is kept.
     * @var int
     */
    var $max;

    /**
     * Ranks features by a chi-squared style score and returns a map from
     * old feature indices to new ones for the best of them. Each feature's
     * score is the larger of its scores for the labels -1 and 1. A feature
     * whose denominator is zero (by the formula below, one that is present
     * in every example or in none) scores zero.
     *
     * @param object $features full feature set
     * @return array associative array mapping a subset of the original
     *      feature indices to new indices
     */
    function select(Features $features)
    {
        $n = $features->numFeatures();
        // Min-heap, so the weakest feature kept so far is always on top.
        $selected = new SplMinHeap();
        $allowed = isset($this->max) ? min($this->max, $n) : $n;
        $labels = array(-1, 1);

        /*
            Start with 1, since 0 is dedicated to the constant intercept
            term; <= $n because n is the last feature.
         */
        for ($j = 1; $j <= $n; $j++) {
            $max_chi2 = 0.0;
            foreach ($labels as $label) {
                /*
                    t = term present
                    l = document has label
                    n = negation
                 */
                $stats = $features->varStats($j, $label);
                list($t_l, $t_nl, $nt_l, $nt_nl) = $stats;
                $num = ($t_l * $nt_nl) - ($t_nl * $nt_l);
                $den = ($t_l + $t_nl) * ($nt_l + $nt_nl);
                /*
                    A zero denominator means the feature is constant, and
                    the numerator is zero as well; score it 0 rather than
                    INF so that it is not ranked above every real feature.
                 */
                $chi2 = $den != 0 ? ($num * $num) / $den : 0.0;
                if ($chi2 > $max_chi2) {
                    $max_chi2 = $chi2;
                }
            }

            /*
                Keep track of top features in a heap, as we compute
                informativeness.
             */
            if ($allowed > 0) {
                $selected->insert(array($max_chi2, $j));
                $allowed -= 1;
            } else if (!$selected->isEmpty()) {
                list($other_chi2, $_) = $selected->top();
                if ($max_chi2 > $other_chi2) {
                    $selected->extract();
                    $selected->insert(array($max_chi2, $j));
                }
            }
        }

        return $this->buildMap($selected);
    }
}
+ * + * Each document in the training set is expected to be fed through an instance + * of a subclass of this abstract class in order to convert it to a feature + * vector. Terms are replaced with feature indices (e.g., 'Pythagorean' => 1, + * 'theorem' => 2, and so on), which are contiguous. The value at a feature + * index is determined by the subclass; one might weight terms according to how + * often they occur in the document, while another might use a simple binary + * representation. The feature index 0 is reserved for an intercept term, which + * always has a value of one. + * + * @author Shawn Tice + * @package seek_quarry + * @subpackage classifier + */ +abstract class Features +{ + /** + * Maps terms to their feature indices, which start at 1. + * @var array + */ + var $vocab = array(); + + /** + * Maps terms to how often they occur in documents by label. + * @var array + */ + var $var_freqs = array(); + + /** + * Maps labels to the number of documents they're assigned to. + * @var array + */ + var $label_freqs = array(-1 => 0, 1 => 0); + + /** + * Maps old feature indices to new ones when a feature subset operation has + * been applied to restrict the number of features. + * @var array + */ + var $feature_map; + + /** + * A list of the top terms according to the last feature subset operation, + * if any. + * @var array + */ + var $top_terms = array(); + + /** + * Maps a new example to a feature vector, adding any new terms to the + * vocabulary, and updating term and label statistics. The example should + * be an array of terms and their counts, and the output simply replaces + * terms with feature indices. 
+ * + * @param array $terms array of terms mapped to the number of times they + * occur in the example + * @param int $label label for this example, either -1 or 1 + * @return array input example with terms replaced by feature indices + */ + function addExample($terms, $label) + { + $this->label_freqs[$label]++; + $features = array(); + foreach ($terms as $term => $count) { + if (isset($this->vocab[$term])) { + $j = $this->vocab[$term]; + } else { + // Var indices start at 1 to accommodate the intercept at 0. + $j = count($this->vocab) + 1; + $this->vocab[$term] = $j; + } + $features[$j] = $count; + // Update term statistics + if (!isset($this->var_freqs[$j][$label])) { + $this->var_freqs[$j][$label] = 1; + } else { + $this->var_freqs[$j][$label]++; + } + } + // Feature 0 is an intercept term + $features[0] = 1; + ksort($features); + return $features; + } + + /** + * Updates the label and term statistics to reflect a label change for an + * example from the training set. A new label of 0 indicates that the + * example is being removed entirely. Note that term statistics only count + * one occurrence of a term per example. + * + * @param array $features feature vector from when the example was + * originally added + * @param int $old_label old example label in {-1, 1} + * @param int $new_label new example label in {-1, 0, 1}, where 0 indicates + * that the example should be removed entirely + */ + function updateExampleLabel($features, $old_label, $new_label) + { + $this->label_freqs[$old_label]--; + if ($new_label != 0) { + $this->label_freqs[$new_label]++; + } + // Remove the intercept term first. + unset($features[0]); + foreach (array_keys($features) as $j) { + $this->var_freqs[$j][$old_label]--; + if ($new_label != 0) { + $this->var_freqs[$j][$new_label]++; + } + } + } + + /** + * Returns the number of features, not including the intercept term + * represented by feature zero. For example, if we had features 0..10, + * this function would return 10. 
+ * + * @return int the number of features in the training set + */ + function numFeatures() + { + return count($this->vocab); + } + + /** + * Returns the positive and negative label counts for the training set. + * + * @return array positive and negative label counts indexed by label, + * either 1 or -1 + */ + function labelStats() + { + return array($this->label_freqs[1], $this->label_freqs[-1]); + } + + /** + * Returns the statistics for a particular feature and label in the + * training set. The statistics are counts of how often the term appears or + * fails to appear in examples with or without the target label. They are + * returned in a flat array, in the following order: + * + * 0 => # examples where feature present, label matches + * 1 => # examples where feature present, label doesn't match + * 2 => # examples where feature absent, label matches + * 3 => # examples where feature absent, label doesn't match + * + * @param int $j feature index + * @param int $label target label + * @return array feature statistics in 4-element flat array + */ + function varStats($j, $label) + { + $tl = isset($this->var_freqs[$j][$label]) ? + $this->var_freqs[$j][$label] : 0; + $t = array_sum($this->var_freqs[$j]); + $l = $this->label_freqs[$label]; + $N = array_sum($this->label_freqs); + return array( + $tl, // t and l + $t - $tl, // t and ~l + $l - $tl, // ~t and l + $N - $t - $l + $tl // ~t and ~l + ); + } + + /** + * Given a FeatureSelection instance, return a new clone of this Features + * instance using a restricted feature subset. The new Features instance + * is augmented with a feature map that it can use to convert feature + * indices from the larger feature set to indices for the reduced set. 
+ * + * @param object $fs FeatureSelection instance to be used to select the + * most informative terms + * @return object new Features instance using the restricted feature set + */ + function restrict(FeatureSelection $fs) + { + $feature_map = $fs->select($this); + /* + Collect the top few most-informative features (if any). The features + are inserted into the feature map by decreasing informativeness, so + iterating through from the beginning will yield the most informative + features first, excepting the very first one, which is guaranteed to + be the intercept term. + */ + $top_features = array(); + next($feature_map); + for ($i = 0; $i < 5; $i++) { + if (!(list($j) = each($feature_map))) { + break; + } + $top_features[$j] = true; + } + $classname = get_class($this); + $new_features = new $classname; + foreach ($this->vocab as $term => $old_j) { + if (isset($feature_map[$old_j])) { + $new_j = $feature_map[$old_j]; + $new_features->vocab[$term] = $new_j; + $new_features->var_freqs[$new_j] = $this->var_freqs[$old_j]; + // Get the actual term associated with a top feature. + if (isset($top_features[$old_j])) { + $top_features[$old_j] = $term; + } + } + } + $new_features->label_freqs = $this->label_freqs; + $new_features->feature_map = $feature_map; + // Note that this preserves the order of top features. + $new_features->top_terms = array_values($top_features); + return $new_features; + } + + /** + * Maps the indices of a feature vector to those used by a restricted + * feature set, dropping and features that aren't in the map. If this + * Features instance isn't restricted, then the passed-in features are + * returned unmodified. 
+     *
+     * @param array $features feature vector mapping feature indices to
+     *      frequencies
+     * @return array original feature vector with indices mapped
+     *      according to the feature_map property, and any features that don't
+     *      occur in feature_map dropped
+     */
+    function mapToRestrictedFeatures($features)
+    {
+        // Without a feature map this instance is unrestricted.
+        if (empty($this->feature_map)) {
+            return $features;
+        }
+        $restricted = array();
+        foreach ($features as $old_j => $value) {
+            if (!isset($this->feature_map[$old_j])) {
+                continue;
+            }
+            $new_j = $this->feature_map[$old_j];
+            $restricted[$new_j] = $value;
+        }
+        return $restricted;
+    }
+
+    /**
+     * Given an array of feature vectors mapping feature indices to counts,
+     * returns a sparse matrix representing the dataset transformed according
+     * to the specific Features subclass. A Features subclass might use simple
+     * binary features, but it might also use some form of TF * IDF, which
+     * requires the full dataset in order to assign weights to particular
+     * document features; thus the necessity of a map over the entire training
+     * set prior to its input to a classification algorithm.
+     *
+     * @param array $docs array of training examples represented as feature
+     *      vectors where the values are per-example counts
+     * @return object SparseMatrix instance whose rows are the transformed
+     *      feature vectors
+     */
+    abstract function mapTrainingSet($docs);
+
+    /**
+     * Maps a vector of terms mapped to their counts within a single document
+     * to a transformed feature vector, exactly like a row in the sparse matrix
+     * returned by mapTrainingSet. This method is used to transform a tokenized
+     * document prior to classification.
+ * + * @param array $tokens associative array of terms mapped to their + * within-document counts + * @return array feature vector corresponding to the tokens, mapped + * according to the implementation of a particular Features subclass + */ + abstract function mapDocument($tokens); +} + + +/** + * A concrete Features subclass that represents a document as a binary + * vector where a one indicates that a feature is present in the document, and + * a zero indicates that it is not. The absent features are ignored, so the + * binary vector is actually sparse, containing only those feature indices + * where the value is one. + * + * @author Shawn Tice + * @package seek_quarry + * @subpackage classifier + */ +class BinaryFeatures extends Features +{ + /** + * Replaces term counts with 1, indicating only that a feature occurs in a + * document. When a Features instance is a subset of a larger instance, it + * will have a feature_map member that maps feature indices from the larger + * feature set to the smaller one. The indices must be mapped in this way + * so that the training set can retain complete information, only throwing + * away features just before training. See the abstract parent class for a + * more thorough introduction to the interface. + * + * @param array $docs array of training examples represented as feature + * vectors where the values are per-example counts + * @return object SparseMatrix instance whose rows are the transformed + * feature vectors + */ + function mapTrainingSet($docs) + { + $m = count($docs); + $n = count($this->vocab) + 1; + $X = new SparseMatrix($m, $n); + + $i = 0; + foreach ($docs as $features) { + /* + If this is a restricted feature set, map from the expanded + feature set first, potentially dropping features. 
+             */
+            $features = $this->mapToRestrictedFeatures($features);
+            /*
+               array_fill_keys handles an example with no surviving features
+               (an empty array), whereas array_combine/array_fill warn and
+               yield false for empty input on older PHP versions.
+             */
+            $new_features = array_fill_keys(array_keys($features), 1);
+            $X->setRow($i++, $new_features);
+        }
+
+        return $X;
+    }
+
+    /**
+     * Converts a map from terms to within-document term counts with the
+     * corresponding sparse binary feature vector used for classification.
+     * Terms missing from the vocabulary are ignored, and index 0 is always
+     * set to 1.
+     *
+     * @param array $tokens associative array of terms mapped to their
+     *      within-document counts
+     * @return array feature vector corresponding to the tokens, mapped
+     *      according to the implementation of a particular Features subclass
+     */
+    function mapDocument($tokens)
+    {
+        $x = array();
+        foreach ($tokens as $token => $count) {
+            if (isset($this->vocab[$token])) {
+                $x[$this->vocab[$token]] = 1;
+            }
+        }
+        $x[0] = 1;
+        ksort($x);
+        return $x;
+    }
+}
+
+
+/**
+ * A concrete Features subclass that represents a document as a
+ * vector of feature weights, where weights are computed using a modified form
+ * of TF * IDF. This feature mapping is experimental, and may not work
+ * correctly.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage classifier
+ */
+class WeightedFeatures extends Features
+{
+    var $D = 0;
+    var $n = array();
+
+    function mapTrainingSet($docs)
+    {
+        $m = count($this->examples);
+        $n = count($this->vocab);
+
+        $this->D = $m;
+        $this->n = array();
+
+        // Fill in $n, the count of documents that contain each term
+        foreach ($this->examples as $features) {
+            foreach (array_keys($features) as $j) {
+                if (!isset($this->n[$j]))
+                    $this->n[$j] = 1;
+                else
+                    $this->n[$j] += 1;
+            }
+        }
+
+        $X = new SparseMatrix($m, $n);
+        $y = $this->exampleLabels;
+
+        foreach ($this->examples as $i => $features) {
+            $u = array();
+            $sum = 0;
+
+            // First compute the unnormalized TF * IDF term weights and keep
+            // track of the sum of all weights in the document.
+ foreach ($features as $j => $count) { + $tf = 1 + log($count); + $idf = log(($this->D + 1) / ($this->n[$j] + 1)); + $weight = $tf * $idf; + $u[$j] = $weight; + $sum += $weight * $weight; + } + + // Now normalize each of the term weights. + $norm = sqrt($sum); + foreach (array_keys($features) as $j) { + $features[$j] = $u[$j] / $norm; + } + $X->setRow($i, $features); + } + + return array($X, $y); + } + + function mapDocument($tokens) + { + $u = array(); + $sum = 0; + + ksort($this->current); + + foreach ($this->current as $j => $count) { + $tf = 1 + log($count); + $idf = log(($this->D + 1) / ($this->n[$j] + 1)); + $weight = $tf * $idf; + $u[$j] = $weight; + $sum += $weight * $weight; + } + + $norm = sqrt($sum); + $x = array(); + foreach (array_keys($this->current) as $j) { + $x[$j] = $u[$j] / $norm; + } + + $this->current = array(); + return $x; + } +} + + +/** + * A sparse matrix implementation based on an associative array of associative + * arrays. + * + * A SparseMatrix is mostly a wrapper around an array of arrays, but it keeps + * track of some extra information such as the true matrix dimensions, and the + * number of non-zero entries. It also provides a convenience method for + * partitioning the matrix rows into two new sparse matrices. + * + * @author Shawn Tice + * @package seek_quarry + * @subpackage classifier + */ +class SparseMatrix implements Iterator +{ + /** + * The number of rows, regardless of whether or not some are empty. + * @var int + */ + var $m; + + /** + * The number of columns, regardless of whether or not some are empty. + * @var int + */ + var $n; + + /** + * The number of non-zero entries. + * @var int + */ + var $nonzero = 0; + + /** + * The actual matrix data, an associative array mapping row indices to + * associative arrays mapping column indices to their values. + * @var array + */ + var $data; + + /** + * Initializes a new sparse matrix with specific dimensions. 
+     *
+     * @param int $m number of rows
+     * @param int $n number of columns
+     */
+    function __construct($m, $n)
+    {
+        $this->m = $m;
+        $this->n = $n;
+        $this->data = array();
+    }
+
+    /** @return int number of rows */
+    function rows() { return $this->m; }
+    /** @return int number of columns */
+    function columns() { return $this->n; }
+    /** @return int number of stored (non-zero) entries */
+    function nonzero() { return $this->nonzero; }
+
+    /**
+     * Sets a particular row of data, keeping track of any new non-zero
+     * entries. Replacing a row that was set earlier first discounts the
+     * entries of the old row so that the non-zero total stays accurate.
+     *
+     * @param int $i row index
+     * @param array $row associative array mapping column indices to values
+     */
+    function setRow($i, $row)
+    {
+        if (isset($this->data[$i])) {
+            $this->nonzero -= count($this->data[$i]);
+        }
+        $this->data[$i] = $row;
+        $this->nonzero += count($row);
+    }
+
+    /**
+     * Given two sets of row indices, returns two new sparse matrices
+     * consisting of the corresponding rows. A row that was never set in this
+     * matrix (i.e., an all-zero row) is copied as an empty row.
+     *
+     * @param array $a_indices row indices for first new sparse matrix
+     * @param array $b_indices row indices for second new sparse matrix
+     * @return array array with two entries corresponding to the first and
+     *      second new matrices
+     */
+    function partition($a_indices, $b_indices)
+    {
+        $a = new SparseMatrix(count($a_indices), $this->n);
+        $b = new SparseMatrix(count($b_indices), $this->n);
+        $new_i = 0;
+        foreach ($a_indices as $i) {
+            $row = isset($this->data[$i]) ? $this->data[$i] : array();
+            $a->setRow($new_i++, $row);
+        }
+        $new_i = 0;
+        foreach ($b_indices as $i) {
+            $row = isset($this->data[$i]) ? $this->data[$i] : array();
+            $b->setRow($new_i++, $row);
+        }
+        return array($a, $b);
+    }
+
+    /* Iterator Interface: walks the rows that have been set, in the order
+       they were inserted, yielding row index => row array. */
+
+    function rewind() { reset($this->data); }
+    function current() { return current($this->data); }
+    function key() { return key($this->data); }
+    function next() { return next($this->data); }
+    function valid() { return !is_null(key($this->data)); }
+}
+?>
\ No newline at end of file diff --git a/lib/classifiers/lasso_regression.php b/lib/classifiers/lasso_regression.php new file mode 100644 index 000000000..52da44901 --- /dev/null +++ b/lib/classifiers/lasso_regression.php @@ -0,0 +1,429 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2013 Chris Pollett
chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage classifier + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009 - 2013 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** Base class definition */ +require_once BASE_DIR."/lib/classifiers/classifier_algorithm.php"; + +/** + * Implements the logistic regression text classification algorithm using lasso + * regression and a cyclic coordinate descent optimization step. + * + * This algorithm is rather slow to converge for large datasets or a large + * number of features, but it does provide regularization in order to combat + * over-fitting, and out-performs Naive-Bayes in tests on the same data set. + * The algorithm augments a standard cyclic coordinate descent approach by + * ``sleeping'' features that don't significantly change during a single step. + * Each time an optimization step for a feature doesn't change the feature + * weight beyond some threshold, that feature is forced to sit out the next + * optimization round. 
The threshold increases over successive rounds, + * effectively placing an upper limit on the number of iterations over all + * features, while simultaneously limiting the number of features updated on + * each round. This optimization speeds up convergence, but at the cost of some + * accuracy. + * + * @author Shawn Tice + * @package seek_quarry + * @subpackage classifier + */ +class LassoRegression extends ClassifierAlgorithm +{ + /** + * Level of detail to be used for logging. Higher values mean more detail. + * @var int + */ + var $debug = 0; + + /** + * Threshold used to determine convergence. + * @var float + */ + var $epsilon = 0.001; + + /** + * Lambda parameter to CLG algorithm. + * @var float + */ + var $lambda = 1.0; + + /** + * Beta vector of feature weights resulting from the training phase. The + * dot product of this vector with a feature vector yields the log + * likelihood that the feature vector describes a document belonging to the + * trained-for class. + * @var array + */ + var $beta; + + /** + * An adaptation of the Zhang-Oles 2001 CLG algorithm by Genkin et al. to + * use the Laplace prior for parameter regularization. On completion, + * optimizes the beta vector to maximize the likelihood of the data set. 
+ * + * @param object $X SparseMatrix representing the training dataset + * @param array $y array of known labels corresponding to the rows of $X + */ + function train($X, $y) + { + $invX = new InvertedData($X); + $this->lambda = $this->estimateLambdaNorm($invX); + $m = $invX->rows(); + $n = $invX->columns(); + $this->beta = array_fill(0, $n, 0.0); + $beta =& $this->beta; + $lambda = $this->lambda; + $d = array_fill(0, $n, 1.0); + $r = array_fill(0, $m, 0.0); + $converged = false; + + $drSum = 0.0; + $rSum = 0.0; + $change = 0.0; + $score = 0.0; + + $minDrj = $this->epsilon; + $prevDrj = $this->epsilon; + $schedule = new SplMaxHeap(); + $nextSchedule = new SplMaxHeap(); + + for ($j = 0; $j < $n; $j++) + $schedule->insert(array($this->epsilon, $j)); + + for ($k = 0; !$converged; $k++) { + $prevR = $r; + + $var = 1; + while (!$schedule->isEmpty()) { + list($drj, $j) = $schedule->top(); + + if ($drj < $minDrj /*|| $drj / $prevDrj < 0.25*/) { + break; + } else { + $schedule->extract(); + $prevDrj = $drj; + } + + $Xj = $invX->iterateColumn($j); + + list($numer, $denom) = $this->computeApproxLikelihood( + $Xj, $y, $r, $d[$j]); + + // Compute tentative step $dvj + if ($beta[$j] == 0) { + $dvj = ($numer - $lambda) / $denom; + if ($dvj <= 0) { + $dvj = ($numer + $lambda) / $denom; + if ($dvj >= 0) + $dvj = 0; + } + } else { + $s = $beta[$j] > 0 ? 1 : -1; + $dvj = ($numer - ($s * $lambda)) / $denom; + if ($s * ($beta[$j] + $dvj) < 0) + $dvj = -$beta[$j]; + } + + if ($dvj == 0) { + $d[$j] /= 2; + $nextSchedule->insert(array($this->epsilon, $j, $k)); + } else { + // Compute delta for beta[j], constrained to trust region. + $dbetaj = min(max($dvj, -$d[$j]), $d[$j]); + + // Update our cached dot product by the delta. + $drj = 0.0; + foreach ($Xj as $cell) { + list($_, $i, $Xij) = $cell; + $dr = $dbetaj * $Xij; + $drj += $dr; + $r[$i] += $dr; + } + + $drj = abs($drj); + $nextSchedule->insert(array($drj, $j, $k)); + + $beta[$j] += $dbetaj; + + // Update the trust region. 
+ $d[$j] = max(2 * abs($dbetaj), $d[$j] / 2); + } + + if ($this->debug > 1) { + $score = $this->score($r, $y, $beta); + } + + $this->log(sprintf( + "itr = %3d, j = %4d (#%d), score = %6.2f, change = %6.4f", + $k + 1, $j, $var, $score, $change)); + + $var++; + } + + // Update $converged + + $drSum = 0.0; + $rSum = 0.0; + for ($i = 0; $i < $m; $i++) { + $drSum += abs($r[$i] - $prevR[$i]); + $rSum += abs($r[$i]); + } + $change = $drSum / (1 + $rSum); + + $converged = $change <= $this->epsilon; + + while (!$schedule->isEmpty()) { + list($drj, $j) = $schedule->extract(); + $nextSchedule->insert(array($drj * 4, $j)); + } + + $tmp = $schedule; + $schedule = $nextSchedule; + $nextSchedule = $tmp; + + $minDrj *= 2; + } + } + + /** + * Returns the pseudo-probability that a new instance is a positive example + * of the class the beta vector was trained to recognize. It only makes + * sense to try classification after at least some training + * has been done on a dataset that includes both positive and negative + * examples of the target class. + * + * @param array $x feature vector represented by an associative array + * mapping features to their weights + */ + function classify($x) + { + $l = 0.0; + foreach ($x as $j => $xj) { + $l += $xj * $this->beta[$j]; + } + return 1.0 / (1.0 + exp(-$l)); + } + + /* PRIVATE INTERFACE */ + + /** + * Computes the approximate likelihood of y given a single feature, and + * returns it as a pair <numerator, denominator>. 
+     *
+     * @param object $Xj iterator over the non-zero entries in column j of the
+     *      data
+     * @param array $y labels corresponding to entries in $Xj; each label is 1
+     *      if example i has the target label, and -1 otherwise
+     * @param array $r cached dot products of the beta vector and feature
+     *      weights for each example i
+     * @param float $d trust region for feature j
+     * @return array two-element array containing the numerator and denominator
+     *      of the likelihood
+     */
+    function computeApproxLikelihood($Xj, $y, $r, $d)
+    {
+        $numer = 0.0;
+        $denom = 0.0;
+
+        foreach ($Xj as $cell) {
+            // Each cell is <j, i, X[i][j]>; the column index is not needed.
+            list($j, $i, $Xij) = $cell;
+
+            $yi = $y[$i];
+            $ri = $yi * $r[$i];
+            $a = abs($ri);
+            $b = abs($d * $Xij);
+            if ($a <= $b) {
+                $F = 0.25;
+            } else {
+                $e = exp($a - $b);
+                $F = 1.0 / (2.0 + $e + (1.0/$e));
+            }
+            $numer += $Xij * $yi / (1 + exp($ri));
+            $denom += $Xij * $Xij * $F;
+        }
+
+        return array($numer, $denom);
+    }
+
+    /**
+     * Computes an approximate score that can be used to get an idea of how
+     * much a given optimization step improved the likelihood of the data set.
+     * The penalty is the sum of the absolute values of the weights, so that
+     * large negative weights lower the score just as large positive ones do.
+     *
+     * @param array $r cached dot products of the beta vector and feature
+     *      weights for each example i
+     * @param array $y labels for each example
+     * @param array $beta beta vector of feature weights (used to
+     *      penalize large weights)
+     * @return float value proportional to the likelihood of the data,
+     *      penalized by the magnitude of the beta vector
+     */
+    function score($r, $y, $beta)
+    {
+        $score = 0;
+        foreach ($r as $i => $ri) {
+            $score += -log(1 + exp(-$ri * $y[$i]));
+        }
+        return $score - array_sum(array_map('abs', $beta));
+    }
+
+    /**
+     * Estimates the lambda parameter from the dataset.
+ * + * @param object $invX inverted X matrix for dataset (essentially a posting + * list of features in X) + * @return float lambda estimate + */ + function estimateLambdaNorm($invX) + { + $sqNorm = 0; + foreach ($invX->iterateData() as $entry) { + $Xij = $entry[2]; + $sqNorm += $Xij * $Xij; + } + + $m = $invX->rows(); + $n = $invX->columns(); + $sigmaSq = $n * $m / $sqNorm; + return sqrt(2) / sqrt($sigmaSq); + } +} + +/** + * Stores a data matrix in an inverted index on columns with non-zero entries. + * + * The index is just an array of entries <j, i, X[i][j]> sorted first by j and + * then by i, where all X[i][j] > 0. Provides a method to iterate over all rows + * which have a non-zero entry for a particular column (feature) j. There is + * no efficient way to iterate over rows in order. + * + * @author Shawn Tice + * @package seek_quarry + * @subpackage classifier + */ +class InvertedData +{ + /** + * Number of rows in the matrix. + * @var int + */ + var $rows; + + /** + * Number of columns in the matrix. + * @var int + */ + var $columns; + + /** + * Array of non-zero matrix entries. + * @var array + */ + var $data; + + /** + * Array of offsets into the $data array, where each offset gives the start + * of the entries for a particular feature. + * @var array + */ + var $index; + + /** + * Converts a SparseMatrix into an InvertedData instance. The data is + * duplicated. 
+     *
+     * @param object $X SparseMatrix instance to convert
+     */
+    function __construct(SparseMatrix $X)
+    {
+        $this->rows = $X->rows();
+        $this->columns = $X->columns();
+        $this->data = array();
+        $this->index = array();
+
+        foreach ($X as $i => $row) {
+            foreach ($row as $j => $Xij) {
+                $this->data[] = array($j, $i, $Xij);
+            }
+        }
+
+        // Arrays compare element by element, so this orders by j, then i.
+        sort($this->data);
+
+        // Record, for every column up to the largest one present, the offset
+        // of the first entry whose column is at least that column.
+        $lastVar = -1;
+        foreach ($this->data as $dataOffset => $x) {
+            $currVar = $x[0];
+            if ($currVar != $lastVar) {
+                for ($var = $lastVar + 1; $var <= $currVar; $var++)
+                    $this->index[$var] = $dataOffset;
+                $lastVar = $currVar;
+            }
+        }
+    }
+
+    function rows() { return $this->rows; }
+    function columns() { return $this->columns; }
+
+    /**
+     * Returns an iterator over the values for a particular column of the
+     * matrix. If no matrix entry in the column is non-zero then an empty
+     * iterator is returned; this includes columns that lie beyond the last
+     * column holding any data, which have no entry in the offset index.
+     *
+     * @param int $j feature index (column) to iterate over
+     * @return object iterator over values in the column
+     */
+    function iterateColumn($j)
+    {
+        $total = count($this->data);
+        // A column missing from the index starts (and ends) past the data.
+        $start = isset($this->index[$j]) ? $this->index[$j] : $total;
+        // The last indexed column runs to the end of the data.
+        $end = isset($this->index[$j + 1]) ? $this->index[$j + 1] : $total;
+        $count = $end - $start;
+
+        if ($count > 0) {
+            $arrItr = new ArrayIterator($this->data);
+            return new LimitIterator($arrItr, $start, $count);
+        }
+
+        return new EmptyIterator();
+    }
+
+    /**
+     * Returns an iterator over the entire matrix. Note that this iterator is
+     * not in row order, but effectively in column order.
+ * + * @return object iterator over every non-zero entry in the matrix + */ + function iterateData() + { + return new ArrayIterator($this->data); + } +} +?> diff --git a/lib/classifiers/naive_bayes.php b/lib/classifiers/naive_bayes.php new file mode 100644 index 000000000..39cd9ffaf --- /dev/null +++ b/lib/classifiers/naive_bayes.php @@ -0,0 +1,201 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage classifier + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009 - 2013 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** Base class definition */ +require_once BASE_DIR."/lib/classifiers/classifier_algorithm.php"; + +/** + * Implements the Naive Bayes text classification algorithm. + * + * This class also provides a method to sample a beta vector from a dataset, + * making it easy to generate several slightly-different classifiers for the + * same dataset in order to form classifier committees. 
+ * + * @author Shawn Tice + * @package seek_quarry + * @subpackage classifier + */ +class NaiveBayes extends ClassifierAlgorithm +{ + /** + * Parameter used to weight positive examples. + * @var float + */ + var $gamma = 1.0; + + /** + * Parameter used to weight negative examples. + * @var float + */ + var $epsilon = 1.0; + + /** + * Beta vector of feature weights resulting from the training phase. The + * dot product of this vector with a feature vector yields the log + * likelihood that the feature vector describes a document belonging to the + * trained-for class. + * @var array + */ + var $beta; + + /** + * Computes the beta vector from the given examples and labels. The + * examples are represented as a sparse matrix where each row is an example + * and each column a feature, and the labels as an array where each value + * is either 1 or -1, corresponding to a positive or negative example. Note + * that the first feature (column 0) corresponds to an intercept term, and + * is equal to 1 for every example. + * + * @param object $X SparseMatrix of training examples + * @param array $y example labels + */ + function train(SparseMatrix $X, $y) + { + $n = $X->columns(); + $p = array_fill(0, $n, 0); + $a = array_fill(0, $n, 0); + $this->beta = array_fill(0, $n, 0.0); + $beta =& $this->beta; + + foreach ($X as $i => $row) { + foreach ($row as $j => $Xij) { + if ($y[$i] == 1) { + $p[$j] += 1; + } else { + $a[$j] += 1; + } + } + } + + $beta[0] = $this->logit($p[0], $a[0]); + for ($j = 1; $j < $n; $j++) { + $beta[$j] = $this->logit($p[$j], $a[$j]) - $beta[0]; + } + } + + /** + * Constructs beta by sampling from the Gamma distribution for each + * feature, parameterized by the number of times the feature appears in + * positive examples, with a scale/rate of 1. This function is used to + * construct classifier committees. 
+ * + * @param object $features Features instance for the training set, used to + * determine how often a given feature occurs in positive and negative + * examples + */ + function sampleBeta($features) + { + $p = array(); + $a = array(); + $n = $features->numFeatures(); + list($p[0], $a[0]) = $features->labelStats(); + for ($j = 1; $j <= $n; $j++) { + $stats = $features->varStats($j, 1); + list($t_l, $t_nl, $nt_l, $nt_nl) = $stats; + $p[$j] = $this->sampleGammaDeviate(1 + $t_l); + $a[$j] = $this->sampleGammaDeviate(1 + $t_nl); + } + + $this->beta = array(); + $beta =& $this->beta; + $beta[0] = $this->logit($p[0], $a[0]); + for ($j = 1; $j <= $n; $j++) { + $beta[$j] = $this->logit($p[$j], $a[$j]) - $beta[0]; + } + } + + /** + * Returns the pseudo-probability that a new instance is a positive example + * of the class the beta vector was trained to recognize. It only makes + * sense to try classification after at least some training + * has been done on a dataset that includes both positive and negative + * examples of the target class. + * + * @param array $x feature vector represented by an associative array + * mapping features to their weights + */ + function classify($x) + { + $beta =& $this->beta; + $l = 0.0; + foreach ($x as $j => $xj) { + /* + The $x values are in {-1,1} instead of {0,1}, so we just + manually skip what would be the zero terms. + */ + if ($xj == 1) + $l += $beta[$j]; + } + return 1.0 / (1.0 + exp(-$l)); + } + + /* PRIVATE INTERFACE */ + + /** + * Computes the log odds of a numerator and denominator, corresponding to + * the number of positive and negative examples exhibiting some feature. 
+     *
+     * @param int $pos count of positive examples exhibiting some feature
+     * @param int $neg count of negative examples
+     * @return float log odds of seeing the feature in a positive example
+     */
+    function logit($pos, $neg)
+    {
+        $odds = ($pos + $this->gamma) / ($neg + $this->epsilon);
+        return log($odds);
+    }
+
+    /**
+     * Computes a Gamma deviate with beta = 1 and integral, small alpha. With
+     * these assumptions, the deviate is just the sum of alpha exponential
+     * deviates. Each exponential deviate is just the negative log of a uniform
+     * deviate. The logs are summed directly, rather than taking the log of
+     * the product of the uniform deviates, so that the running value cannot
+     * underflow to zero, and each uniform deviate is drawn from (0, 1] so
+     * that its log is always finite.
+     *
+     * @param int $alpha parameter to Gamma distribution (in practice, a count
+     *      of occurrences of some feature)
+     * @return float a deviate from the Gamma distribution parameterized by
+     *      $alpha
+     */
+    function sampleGammaDeviate($alpha)
+    {
+        $deviate = 0.0;
+        $randmax = getrandmax();
+        for ($i = 0; $i < $alpha; $i++) {
+            // rand() may return 0; shifting by one keeps log() away from 0.
+            $uniform = (rand() + 1.0) / ($randmax + 1.0);
+            $deviate -= log($uniform);
+        }
+        return $deviate;
+    }
+}
diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php index 3b857c390..140155cc8 100644 --- a/lib/crawl_constants.php +++ b/lib/crawl_constants.php @@ -219,8 +219,9 @@ interface CrawlConstants const UI_FLAGS = 'cr'; const KEYWORD_LINKS = 'cs'; const END_ITERATOR = 'ct'; + const ACTIVE_CLASSIFIERS = 'cu'; + const ACTIVE_CLASSIFIERS_DATA = 'cv'; const NEEDS_OFFSET_FLAG = 0x7FFFFFFF; - } ?> diff --git a/lib/upgrade_functions.php b/lib/upgrade_functions.php index 50fa64fd9..655b40a4f 100644 --- a/lib/upgrade_functions.php +++ b/lib/upgrade_functions.php @@ -76,7 +76,7 @@ function upgradeDatabaseWorkDirectoryCheck() $result = @$model->db->execute($sql); if($result !== false) { $row = $model->db->fetchArray($result); - if(isset($row['ID']) && $row['ID'] >= 14) { + if(isset($row['ID']) && $row['ID'] >= 16) { return false; } else { return true; @@ -94,7 +94,7 @@ function upgradeDatabaseWorkDirectoryCheck() */ function
upgradeDatabaseWorkDirectory() { - $versions = array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15); + $versions = array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16); $model = new Model(); $model->db->selectDB(DB_NAME); $sql = "SELECT ID FROM VERSION"; @@ -230,6 +230,7 @@ function upgradeDatabaseVersion3(&$db) 'Sắp xếp hoạt động dựa theo hoạch định')"); } + /** * Upgrades a Version 3 version of the Yioop! database to a Version 4 version * @param object $db datasource to use to upgrade @@ -477,7 +478,75 @@ function upgradeDatabaseVersion15(&$db) AND GROUP_ID=0"); $db->execute("INSERT INTO MIX_COMPONENTS VALUES( 3, 0, 1, 1, 'media:video site:doc')"); - $db->execute("INSERT INTO LOCALE VALUES (21, 'te', 'తెలుగు', 'lr-tb')"); + $db->execute("INSERT INTO LOCALE VALUES (21, 'te', + 'తెలుగు', 'lr-tb')"); + upgradeLocales(); +} + +/** + * Upgrades a Version 15 version of the Yioop! database to a Version 16 version + * @param object $db datasource to use to upgrade + */ +function upgradeDatabaseVersion16(&$db) +{ + $db->execute("DELETE FROM VERSION WHERE ID < 15"); + $db->execute("UPDATE VERSION SET ID=16 WHERE ID=15"); + + $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, 12)"); + + $db->execute("UPDATE ACTIVITY + SET ACTIVITY_ID = ACTIVITY_ID + 100, + TRANSLATION_ID = TRANSLATION_ID + 100 + WHERE ACTIVITY_ID > 5 AND ACTIVITY_ID < 1000"); + + $db->execute("INSERT INTO ACTIVITY + VALUES (6, 6, 'manageClassifiers')"); + + $db->execute("UPDATE ACTIVITY + SET ACTIVITY_ID = ACTIVITY_ID - 99, + TRANSLATION_ID = TRANSLATION_ID - 99 + WHERE ACTIVITY_ID > 6 AND ACTIVITY_ID < 1000"); + + $db->execute("UPDATE sqlite_sequence SET seq = 12 + WHERE name = 'ACTIVITY'"); + + $db->execute("UPDATE TRANSLATION + SET TRANSLATION_ID = TRANSLATION_ID + 100 + WHERE TRANSLATION_ID > 5 AND TRANSLATION_ID < 1000"); + + $db->execute("INSERT INTO TRANSLATION + VALUES (6, 'db_activity_manage_classifiers')"); + + $db->execute("UPDATE TRANSLATION + SET TRANSLATION_ID = TRANSLATION_ID - 99 + 
WHERE TRANSLATION_ID > 6 AND TRANSLATION_ID < 1000"); + + $db->execute("UPDATE TRANSLATION_LOCALE + SET TRANSLATION_ID = TRANSLATION_ID + 100 + WHERE TRANSLATION_ID > 5 AND TRANSLATION_ID < 1000"); + + $db->execute("INSERT INTO TRANSLATION_LOCALE + VALUES (6, 1, 'Classifiers')"); + + $db->execute("UPDATE TRANSLATION_LOCALE + SET TRANSLATION_ID = TRANSLATION_ID - 99 + WHERE TRANSLATION_ID > 6 AND TRANSLATION_ID < 1000"); + + $old_archives_path = WORK_DIRECTORY."/cache/archives"; + $new_archives_path = WORK_DIRECTORY."/archives"; + if (file_exists($old_archives_path)) { + rename($old_archives_path, $new_archives_path); + } else { + mkdir($new_archives_path); + } + $db->setWorldPermissionsRecursive($new_archives_path); + + $new_classifiers_path = WORK_DIRECTORY."/classifiers"; + if (!file_exists($new_classifiers_path)) { + mkdir($new_classifiers_path); + } + $db->setWorldPermissionsRecursive($new_classifiers_path); + upgradeLocales(); } ?> diff --git a/locale/en-US/configure.ini b/locale/en-US/configure.ini index fdd3e7cb7..33451b1a8 100755 --- a/locale/en-US/configure.ini +++ b/locale/en-US/configure.ini @@ -28,409 +28,472 @@ ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//controllers ; -; admin_controller.php line: 138 +; admin_controller.php line: 140 admin_controller_login_successful = "Login Successful!!" ; -; admin_controller.php line: 143 +; admin_controller.php line: 145 admin_controller_login_failed = "Username or Password Incorrect!" ; -; admin_controller.php line: 148 +; admin_controller.php line: 150 admin_controller_login_to_config = "Login to continue configuration (default: u=root, p=)" ; -; admin_controller.php line: 152 +; admin_controller.php line: 154 admin_controller_status_updates_stopped = "Status updates have stopped." 
; -; admin_controller.php line: 333 +; admin_controller.php line: 337 admin_controller_news_off = "Updates Off" ; -; admin_controller.php line: 334 +; admin_controller.php line: 338 admin_controller_news_update_web = "Web Update" ; -; admin_controller.php line: 335 +; admin_controller.php line: 339 admin_controller_news_process = "News Process" ; -; admin_controller.php line: 362 +; admin_controller.php line: 366 admin_controller_passwords_dont_match = "Typed passwords do not match." ; -; admin_controller.php line: 374 +; admin_controller.php line: 378 admin_controller_invalid_old_password = "Current password incorrect." ; -; admin_controller.php line: 381 +; admin_controller.php line: 385 admin_controller_change_password = "Password change successful!!" ; -; admin_controller.php line: 414 +; admin_controller.php line: 418 admin_controller_select_username = "Select Name" ; -; admin_controller.php line: 451 +; admin_controller.php line: 455 admin_controller_select_rolename = "Select Role" ; -; admin_controller.php line: 477 +; admin_controller.php line: 481 admin_controller_passwords_dont_match = "Typed passwords do not match." 
; -; admin_controller.php line: 484 +; admin_controller.php line: 488 admin_controller_username_exists = "Cannot Create User As Username Exists" ; -; admin_controller.php line: 491 +; admin_controller.php line: 495 admin_controller_username_added = "User Created" ; -; admin_controller.php line: 500 +; admin_controller.php line: 504 admin_controller_username_doesnt_exists = "Username Does Not Exist" ; -; admin_controller.php line: 507 +; admin_controller.php line: 511 admin_controller_username_deleted = "User Deleted" ; -; admin_controller.php line: 514 +; admin_controller.php line: 518 admin_controller_username_doesnt_exists = "Username Does Not Exist" ; -; admin_controller.php line: 520 +; admin_controller.php line: 524 admin_controller_rolename_doesnt_exists = "Role Name Does not Exist" ; -; admin_controller.php line: 526 +; admin_controller.php line: 530 admin_controller_rolename_added = "Role Name Added" ; -; admin_controller.php line: 537 +; admin_controller.php line: 541 admin_controller_username_doesnt_exists = "Username Does Not Exist" ; -; admin_controller.php line: 543 +; admin_controller.php line: 547 admin_controller_rolename_doesnt_exists = "Role Name Does not Exist" ; -; admin_controller.php line: 553 +; admin_controller.php line: 557 admin_controller_rolename_deleted = "Role Name Deleted" ; -; admin_controller.php line: 583 +; admin_controller.php line: 587 admin_controller_select_rolename = "Select Role" ; -; admin_controller.php line: 618 +; admin_controller.php line: 622 admin_controller_select_activityname = "Select Activity" ; -; admin_controller.php line: 651 +; admin_controller.php line: 655 admin_controller_rolename_exists = "Role Name Exists" ; -; admin_controller.php line: 661 +; admin_controller.php line: 665 admin_controller_rolename_added = "Role Name Added" ; -; admin_controller.php line: 672 +; admin_controller.php line: 676 admin_controller_rolename_doesnt_exists = "Role Name Does not Exist" ; -; admin_controller.php line: 680 +; 
admin_controller.php line: 684 admin_controller_rolename_deleted = "Role Name Deleted" ; -; admin_controller.php line: 686 +; admin_controller.php line: 690 admin_controller_rolename_doesnt_exists = "Role Name Does not Exist" ; -; admin_controller.php line: 692 +; admin_controller.php line: 696 admin_controller_activityname_doesnt_exists = "Activity Name Does not Exist" ; -; admin_controller.php line: 702 +; admin_controller.php line: 706 admin_controller_activity_added = "Activity Added" ; -; admin_controller.php line: 708 +; admin_controller.php line: 712 admin_controller_rolename_doesnt_exists = "Role Name Does not Exist" ; -; admin_controller.php line: 715 +; admin_controller.php line: 719 admin_controller_activityname_doesnt_exists = "Activity Name Does not Exist" ; -; admin_controller.php line: 727 +; admin_controller.php line: 731 admin_controller_activity_deleted = "Activity Deleted" ; -; admin_controller.php line: 773 +; admin_controller.php line: 777 admin_controller_stop_crawl = "Stopping crawl. . .This will take a moment to refresh." ; -; admin_controller.php line: 787 +; admin_controller.php line: 791 admin_controller_resume_crawl = "Resuming crawl. . .This will take a moment to refresh." ; -; admin_controller.php line: 817 +; admin_controller.php line: 821 admin_controller_delete_crawl_success = "Deleting Crawl. . .This will take a moment to refresh." ; -; admin_controller.php line: 822 +; admin_controller.php line: 826 admin_controller_delete_crawl_fail = "Delete Crawl Failed!!" ; -; admin_controller.php line: 829 +; admin_controller.php line: 833 admin_controller_set_index = "Setting Crawl To Use as Index" ; -; admin_controller.php line: 854 +; admin_controller.php line: 858 admin_controller_starting_new_crawl = "Starting New Crawl!" 
; -; admin_controller.php line: 864 +; admin_controller.php line: 868 admin_controller_no_description = "No Description for Crawl" ; -; admin_controller.php line: 973 +; admin_controller.php line: 981 admin_controller_use_below = "Use options below" ; -; admin_controller.php line: 974 +; admin_controller.php line: 982 admin_controller_use_defaults = "Use Yioop! defaults" ; -; admin_controller.php line: 976 +; admin_controller.php line: 984 admin_controller_use_below = "Use options below" ; -; admin_controller.php line: 980 +; admin_controller.php line: 988 admin_controller_previous_crawl = "Previous Crawl:" ; -; admin_controller.php line: 1067 +; admin_controller.php line: 1075 admin_controller_breadth_first = "Breadth First" ; -; admin_controller.php line: 1069 +; admin_controller.php line: 1077 admin_controller_page_importance = "Page Importance" ; -; admin_controller.php line: 1135 +; admin_controller.php line: 1143 admin_controller_urls_injected = "Urls Injected!" ; -; admin_controller.php line: 1146 +; admin_controller.php line: 1154 admin_controller_update_seed_info = "Updating Seed Site Info!" ; -; admin_controller.php line: 1238 +; admin_controller.php line: 1246 admin_controller_select_crawl = "Select Crawl" ; -; admin_controller.php line: 1239 +; admin_controller.php line: 1247 admin_controller_default_crawl = "Default Crawl" ; -; admin_controller.php line: 1241 +; admin_controller.php line: 1249 admin_controller_select_crawl = "Select Crawl" ; -; admin_controller.php line: 1243 +; admin_controller.php line: 1251 admin_controller_default_crawl = "Default Crawl" ; -; admin_controller.php line: 1270 +; admin_controller.php line: 1278 admin_controller_unnamed = "Unnamed Crawl" ; -; admin_controller.php line: 1275 +; admin_controller.php line: 1283 admin_controller_mix_created = "Crawl Mix Created!" 
; -; admin_controller.php line: 1284 +; admin_controller.php line: 1292 admin_controller_set_index = "Setting Crawl To Use as Index" ; -; admin_controller.php line: 1294 +; admin_controller.php line: 1302 admin_controller_mix_doesnt_exists = "Mix to Delete Does not Exist!" ; -; admin_controller.php line: 1302 +; admin_controller.php line: 1310 admin_controller_mix_deleted = "Crawl Mix Deleted!" ; -; admin_controller.php line: 1338 +; admin_controller.php line: 1346 editmix_element_add_crawls = "Add Crawls" ; -; admin_controller.php line: 1340 +; admin_controller.php line: 1348 editmix_element_num_results = "Number of Results" ; -; admin_controller.php line: 1341 +; admin_controller.php line: 1349 editmix_element_del_grp = "Delete Group" ; -; admin_controller.php line: 1342 +; admin_controller.php line: 1350 editmix_element_weight = "Weight" ; -; admin_controller.php line: 1343 +; admin_controller.php line: 1351 editmix_element_name = "Name" ; -; admin_controller.php line: 1344 +; admin_controller.php line: 1352 editmix_add_keywords = "Keywords" ; -; admin_controller.php line: 1345 +; admin_controller.php line: 1353 editmix_element_actions = "Actions" ; -; admin_controller.php line: 1346 +; admin_controller.php line: 1354 editmix_add_query = "Add Query" ; -; admin_controller.php line: 1347 +; admin_controller.php line: 1355 editmix_element_delete = "Delete" ; -; admin_controller.php line: 1399 +; admin_controller.php line: 1407 admin_controller_mix_saved = "Crawl Mix Changes Saved!" ; -; admin_controller.php line: 1453 +; admin_controller.php line: 1458 admin_controller_use_below = "Use options below" ; -; admin_controller.php line: 1454 +; admin_controller.php line: 1459 admin_controller_use_defaults = "Use Yioop! 
defaults" ; -; admin_controller.php line: 1456 +; admin_controller.php line: 1461 admin_controller_use_below = "Use options below" ; -; admin_controller.php line: 1460 +; admin_controller.php line: 1465 admin_controller_previous_crawl = "Previous Crawl:" ; -; admin_controller.php line: 1465 +; admin_controller.php line: 1470 admin_controller_recrawl_never = "Never" ; -; admin_controller.php line: 1466 +; admin_controller.php line: 1471 admin_controller_recrawl_1day = "1 days" ; -; admin_controller.php line: 1467 +; admin_controller.php line: 1472 admin_controller_recrawl_2day = "2 days" ; -; admin_controller.php line: 1468 +; admin_controller.php line: 1473 admin_controller_recrawl_3day = "3 days" ; -; admin_controller.php line: 1469 +; admin_controller.php line: 1474 admin_controller_recrawl_7day = "7 days" ; -; admin_controller.php line: 1470 +; admin_controller.php line: 1475 admin_controller_recrawl_14day = "14 days" ; -; admin_controller.php line: 1657 +; admin_controller.php line: 1685 admin_controller_page_options_updated = "Page Options Updated!" ; -; admin_controller.php line: 1683 +; admin_controller.php line: 1711 admin_controller_page_options_running_tests = "Running Tests!" ; -; admin_controller.php line: 1800 +; admin_controller.php line: 1851 +admin_controller_new_classifier = "New classifier created." +; +; admin_controller.php line: 1854 +admin_controller_classifier_exists = "A classifier with that name already exists." +; +; admin_controller.php line: 1866 +admin_controller_no_classifier = "No classifier with that name." +; +; admin_controller.php line: 1884 +admin_controller_finalizing_classifier = "Finalizing classifier." +; +; admin_controller.php line: 1907 +admin_controller_classifier_deleted = "Classifier deleted." +; +; admin_controller.php line: 1911 +admin_controller_no_classifier = "No classifier with that name." +; +; admin_controller.php line: 1960 +admin_controller_classifier_exists = "A classifier with that name already exists." 
+; +; admin_controller.php line: 1971 +editclassifier_load_failed = "Failed to load documents" +; +; admin_controller.php line: 1973 +editclassifier_loading = "Loading" +; +; admin_controller.php line: 1975 +editclassifier_added_examples = "Added {1} {2} examples" +; +; admin_controller.php line: 1977 +editclassifier_label_update_failed = "Failed to update labels." +; +; admin_controller.php line: 1979 +editclassifier_updating = "Updating" +; +; admin_controller.php line: 1981 +editclassifier_acc_update_failed = "Failed to update accuracy" +; +; admin_controller.php line: 1983 +editclassifier_na = "N/A" +; +; admin_controller.php line: 1985 +editclassifier_no_docs = "No documents" +; +; admin_controller.php line: 1987 +editclassifier_num_docs = "{1}{2} documents" +; +; admin_controller.php line: 1989 +editclassifier_in_class = "In Class" +; +; admin_controller.php line: 1991 +editclassifier_not_in_class = "Not In Class" +; +; admin_controller.php line: 1993 +editclassifier_skip = "Skip" +; +; admin_controller.php line: 1995 +editclassifier_prediction = "Prediction: {1}" +; +; admin_controller.php line: 1997 +editclassifier_scores = "{1}%% confidence, {2}%% disagreement" +; +; admin_controller.php line: 2041 admin_controller_results_editor_update = "Filter Pages Updated!" ; -; admin_controller.php line: 1814 +; admin_controller.php line: 2055 admin_controller_edited_pages = "Select a Previously Edited URL" ; -; admin_controller.php line: 1827 +; admin_controller.php line: 2068 admin_controller_results_editor_need_url = "Result Page Update needs to Specify the URL!" ; -; admin_controller.php line: 1833 +; admin_controller.php line: 2074 admin_controller_results_editor_page_updated = "Result Page Updated!" ; -; admin_controller.php line: 1846 +; admin_controller.php line: 2087 admin_controller_results_editor_page_loaded = "Page Loaded!" 
; -; admin_controller.php line: 1891 +; admin_controller.php line: 2132 admin_controller_select_machine = "Select Machine" ; -; admin_controller.php line: 1962 +; admin_controller.php line: 2203 admin_controller_machine_added = "Machine Added!" ; -; admin_controller.php line: 1969 +; admin_controller.php line: 2210 admin_controller_machine_exists = "Machine Name Already Exists; Please Delete First!" ; -; admin_controller.php line: 1973 +; admin_controller.php line: 2214 admin_controller_machine_incomplete = "Missing Fields From Machine Form!" ; -; admin_controller.php line: 1982 +; admin_controller.php line: 2223 admin_controller_machine_doesnt_exists = "Machine Name does not Exists!" ; -; admin_controller.php line: 1999 +; admin_controller.php line: 2240 admin_controller_stop_service_first = "Machine in use. Please stop the service running on it!" ; -; admin_controller.php line: 2012 +; admin_controller.php line: 2253 admin_controller_machine_deleted = "Machine Deleted!" ; -; admin_controller.php line: 2033 +; admin_controller.php line: 2274 admin_controller_news_process_running = "News Updater Seems To Be Running Already" ; -; admin_controller.php line: 2041 +; admin_controller.php line: 2282 admin_controller_news_mode_updated = "News Update Mode Changed!" ; -; admin_controller.php line: 2045 +; admin_controller.php line: 2286 admin_controller_news_update_failed = "News Update Mode Change Failed!" ; -; admin_controller.php line: 2108 +; admin_controller.php line: 2349 admin_controller_no_machine_log = "No Log File Found." ; -; admin_controller.php line: 2137 +; admin_controller.php line: 2378 admin_controller_machine_servers_updated = "Machine's Servers Updated!" ; -; admin_controller.php line: 2141 +; admin_controller.php line: 2382 admin_controller_machine_no_action = "Unable to Perform Action!" 
; -; admin_controller.php line: 2174 +; admin_controller.php line: 2415 admin_controller_select_localename = "Select Locale" ; -; admin_controller.php line: 2218 +; admin_controller.php line: 2459 admin_controller_locale_added = "Locale Added!" ; -; admin_controller.php line: 2225 +; admin_controller.php line: 2466 admin_controller_localename_doesnt_exists = "Locale Does Not Exist!" ; -; admin_controller.php line: 2234 +; admin_controller.php line: 2475 admin_controller_localename_deleted = "Locale Deleted" ; -; admin_controller.php line: 2243 +; admin_controller.php line: 2484 admin_controller_select_staticpages = "Select a page" ; -; admin_controller.php line: 2262 +; admin_controller.php line: 2503 admin_controller_staticpage_updated = "Static Page Updated!" ; -; admin_controller.php line: 2289 +; admin_controller.php line: 2530 admin_controller_localestrings_updated = "Locale Strings Updated!" ; -; admin_controller.php line: 2346 +; admin_controller.php line: 2587 admin_controller_php_version = "PHP Version 5.3 or Newer" ; -; admin_controller.php line: 2354 +; admin_controller.php line: 2595 admin_controller_no_write_config_php = "configs/config.php not web server writable." ; -; admin_controller.php line: 2359 +; admin_controller.php line: 2600 admin_controller_no_write_work_dir = "Work directory needs to be writable by web server. " ; -; admin_controller.php line: 2364 +; admin_controller.php line: 2605 admin_controller_post_size_small = "php.ini file variable post_max_size should be at least 2M" ; -; admin_controller.php line: 2370 +; admin_controller.php line: 2611 admin_controller_missing_required = "The following required items were missing: %s" ; -; admin_controller.php line: 2393 +; admin_controller.php line: 2634 admin_controller_missing_optional = "The following optional items were missing: %s" ; -; admin_controller.php line: 2398 +; admin_controller.php line: 2639 admin_controller_check_passed = "Check Passed." 
; -; admin_controller.php line: 2403 +; admin_controller.php line: 2644 admin_controller_using_local_config = "Using configs/local_config.php so changing work directory above may not work." ; -; admin_controller.php line: 2428 +; admin_controller.php line: 2669 admin_controller_media_kind = "Media Kind" ; -; admin_controller.php line: 2429 +; admin_controller.php line: 2670 admin_controller_video = "Video" ; -; admin_controller.php line: 2430 -admin_controller_rss_feed = "News Feed" +; admin_controller.php line: 2671 +admin_controller_rss_feed = "RSS" ; -; admin_controller.php line: 2444 +; admin_controller.php line: 2685 admin_controller_sources_indexes = "Index/Mix to Use" ; -; admin_controller.php line: 2497 +; admin_controller.php line: 2738 admin_controller_media_source_added = "Media Source Added!" ; -; admin_controller.php line: 2505 +; admin_controller.php line: 2746 admin_controller_media_source_deleted = "Media Source Deleted!" ; -; admin_controller.php line: 2521 +; admin_controller.php line: 2762 admin_controller_subsearch_added = "Subsearch Added!" ; -; admin_controller.php line: 2529 +; admin_controller.php line: 2770 admin_controller_subsearch_deleted = "Subsearch Deleted!" ; -; admin_controller.php line: 2604 +; admin_controller.php line: 2845 admin_controller_configure_use_absolute_path = "Must use an Absolute path for Work Directory" ; -; admin_controller.php line: 2616 +; admin_controller.php line: 2857 admin_controller_configure_diff_base_dir = "Work Directory cannot be contained in Yioop folder!" ; -; admin_controller.php line: 2649 +; admin_controller.php line: 2890 admin_controller_configure_work_dir_set = "Work Directory Set! You may need to re-login!" ; -; admin_controller.php line: 2663 +; admin_controller.php line: 2904 admin_controller_name_your_bot = "Please Name Your robot" ; -; admin_controller.php line: 2686 +; admin_controller.php line: 2927 admin_controller_configure_work_profile_made = "Working Directory and Profile Created!" 
; -; admin_controller.php line: 2699 +; admin_controller.php line: 2940 admin_controller_configure_no_set_config = "Unable to Update config.php File!" ; -; admin_controller.php line: 2711 +; admin_controller.php line: 2952 admin_controller_configure_no_create_profile = "Unable to Create Profile!" ; -; admin_controller.php line: 2722 +; admin_controller.php line: 2963 admin_controller_configure_work_dir_invalid = "Work Directory is Invalid! Cannot Create Profile!" ; -; admin_controller.php line: 2734 +; admin_controller.php line: 2975 admin_controller_configure_work_dir_invalid = "Work Directory is Invalid! Cannot Create Profile!" ; -; admin_controller.php line: 2782 +; admin_controller.php line: 3023 admin_controller_configure_no_change_db = "Problem Updating Database!" ; -; admin_controller.php line: 2797 +; admin_controller.php line: 3038 admin_controller_configure_profile_change = "Profile Updated!" ; -; admin_controller.php line: 2812 +; admin_controller.php line: 3053 admin_controller_configure_no_change_profile = "There was a Problem Updating Profile!" ; -; admin_controller.php line: 2850 +; admin_controller.php line: 3091 admin_controller_describe_robot = "Please Describe Your Robot" ; ; machine_controller.php line: 182 @@ -448,55 +511,55 @@ search_controller_crawl_info = "Index: %s -- Size: %s pages/%s urls" ; search_controller.php line: 528 search_controller_search = "Search" ; -; search_controller.php line: 629 +; search_controller.php line: 636 search_controller_no_index_set = "No Search Index Set For Use!" ; -; search_controller.php line: 632 +; search_controller.php line: 639 search_controller_no_index_set = "No Search Index Set For Use!" ; -; search_controller.php line: 1430 +; search_controller.php line: 1438 search_controller_no_archive_page = "The website in question has requested this page not be archived." 
; -; search_controller.php line: 1478 +; search_controller.php line: 1490 search_controller_original_page = "This image appeared on the page:" ; -; search_controller.php line: 1496 +; search_controller.php line: 1508 search_controller_extracted_title = "Extracted Title" ; -; search_controller.php line: 1498 +; search_controller.php line: 1510 search_controller_extracted_description = "Extracted Description" ; -; search_controller.php line: 1500 +; search_controller.php line: 1512 search_controller_extracted_links = "Extracted Links" ; -; search_controller.php line: 1505 +; search_controller.php line: 1517 search_controller_extracted_allow_paths = "Extracted Allowed To Crawl Paths" ; -; search_controller.php line: 1511 +; search_controller.php line: 1523 search_controller_extracted_disallow_paths = "Extracted Disallowed To Crawl Paths" ; -; search_controller.php line: 1517 +; search_controller.php line: 1529 search_controller_crawl_delay = "YioopBot Crawl Delay" ; -; search_controller.php line: 1590 +; search_controller.php line: 1600 search_controller_cache_comment = "Yioop Cache Page... This page has been modified to add a robots directive, make links absolute, add extracted summaries, and to highlight query terms." ; -; search_controller.php line: 1636 +; search_controller.php line: 1644 search_controller_cached_version = "This cached version of %s was obtained by the Yioop crawler on %s." 
; -; search_controller.php line: 1751 +; search_controller.php line: 1759 search_controller_header_summaries = "Toggle Extracted Headers and Summaries" ; -; search_controller.php line: 1875 +; search_controller.php line: 1883 search_controller_history = "Toggle History" ; -; search_controller.php line: 2051 +; search_controller.php line: 2059 search_controller_all_cached = "All Cached Versions - Change Year and/or Months to see Links" ; -; search_controller.php line: 2082 +; search_controller.php line: 2090 search_controller_year = "Year:" ; -; search_controller.php line: 2083 +; search_controller.php line: 2091 search_controller_month = "Month:" ; ; settings_controller.php line: 134 @@ -779,6 +842,63 @@ crawloptions_element_need_api_for_mix = "Yioop API access required for mix archi ; crawloptions_element.php line: 167 crawloptions_element_save_options = "Save Options" ; +; editclassifier_element.php line: 63 +editclassifier_back = "Back" +; +; editclassifier_element.php line: 65 +editclassifier_edit_classifier = "Edit Classifier" +; +; editclassifier_element.php line: 77 +editclassifier_classifier_label = "Classifier Label:" +; +; editclassifier_element.php line: 82 +editclassifier_change = "Change" +; +; editclassifier_element.php line: 85 +editclassifier_statistics = "Statistics" +; +; editclassifier_element.php line: 86 +editclassifier_positive_examples = "Positive Examples:" +; +; editclassifier_element.php line: 89 +editclassifier_negative_examples = "Negative Examples:" +; +; editclassifier_element.php line: 92 +editclassifier_accuracy = "Accuracy:" +; +; editclassifier_element.php line: 97 +editclassifier_na = "N/A" +; +; editclassifier_element.php line: 102 +editclassifier_update = "Update" +; +; editclassifier_element.php line: 103 +editclassifier_add_examples = "Add Examples" +; +; editclassifier_element.php line: 107 +editclassifier_source = "Source:" +; +; editclassifier_element.php line: 111 +editclassifier_default_crawl = "Default Crawl" +; +; 
editclassifier_element.php line: 121 +editclassifier_label_by_hand = "Label By Hand" +; +; editclassifier_element.php line: 123 +editclassifier_all_in_class = "All In Class" +; +; editclassifier_element.php line: 125 +editclassifier_none_in_class = "None In Class" +; +; editclassifier_element.php line: 130 +editclassifier_keywords = "Keywords:" +; +; editclassifier_element.php line: 135 +editclassifier_load = "Load" +; +; editclassifier_element.php line: 141 +editclassifier_no_documents = "No Documents" +; ; editlocales_element.php line: 62 editlocales_element_back_to_manage = "Back" ; @@ -869,6 +989,48 @@ manageaccount_element_retype_password = "Retype Password: " ; manageaccount_element.php line: 84 manageaccount_element_save = "Save" ; +; manageclassifiers_element.php line: 58 +manageclassifiers_manage_classifiers = "Manage Classifiers" +; +; manageclassifiers_element.php line: 66 +manageclassifiers_classifier_name = "Classifier Name:" +; +; manageclassifiers_element.php line: 71 +manageclassifiers_create_button = "Create" +; +; manageclassifiers_element.php line: 75 +manageclassifiers_available_classifiers = "Available Classifiers" +; +; manageclassifiers_element.php line: 78 +manageclassifiers_label_col = "Label" +; +; manageclassifiers_element.php line: 79 +manageclassifiers_positive_col = "Positive" +; +; manageclassifiers_element.php line: 80 +manageclassifiers_negative_col = "Negative" +; +; manageclassifiers_element.php line: 82 +manageclassifiers_actions_col = "Actions" +; +; manageclassifiers_element.php line: 95 +manageclassifiers_edit = "Edit" +; +; manageclassifiers_element.php line: 98 +manageclassifiers_finalized = "Finalized" +; +; manageclassifiers_element.php line: 104 +manageclassifiers_finalize = "Finalize" +; +; manageclassifiers_element.php line: 106 +manageclassifiers_finalize = "Finalize" +; +; manageclassifiers_element.php line: 109 +manageclassifiers_finalizing = "Finalizing" +; +; manageclassifiers_element.php line: 115 
+manageclassifiers_delete = "Delete" +; ; managecrawls_element.php line: 56 managecrawls_element_create_crawl = "Create Crawl" ; @@ -1076,121 +1238,127 @@ mixcrawl_search_index = "Search Index" ; mixcrawls_element.php line: 130 mixcrawls_view_delete = "Delete" ; -; pageoptions_element.php line: 66 +; pageoptions_element.php line: 63 pageoptions_element_crawl_time = "Crawl Time" ; -; pageoptions_element.php line: 71 +; pageoptions_element.php line: 68 pageoptions_element_search_time = "Search Time" ; -; pageoptions_element.php line: 76 +; pageoptions_element.php line: 73 pageoptions_element_test_options = "Test Options" ; -; pageoptions_element.php line: 88 +; pageoptions_element.php line: 85 pageoptions_element_load_options = "Get Page Options From:" ; -; pageoptions_element.php line: 93 +; pageoptions_element.php line: 90 pageoptions_element_page_range = "Byte Range to Download (0 - Value):" ; -; pageoptions_element.php line: 98 +; pageoptions_element.php line: 95 pageoptions_element_save_cache = "Cache whole crawled pages:" ; -; pageoptions_element.php line: 108 +; pageoptions_element.php line: 105 pageoptions_element_allow_recrawl = "Allow Page Recrawl After:" ; -; pageoptions_element.php line: 114 +; pageoptions_element.php line: 111 pageoptions_element_file_types = "Page File Types to Crawl:" ; -; pageoptions_element.php line: 142 +; pageoptions_element.php line: 141 +pageoptions_element_classifiers_to_apply = "Classifiers to Apply:" +; +; pageoptions_element.php line: 176 +pageoptions_element_no_classifiers = "No classifiers." 
+; +; pageoptions_element.php line: 179 pageoptions_element_indexing_plugins = "Indexing Plugins" ; -; pageoptions_element.php line: 145 +; pageoptions_element.php line: 183 pageoptions_element_plugin = "Plugin" ; -; pageoptions_element.php line: 148 +; pageoptions_element.php line: 186 pageoptions_element_plugin_include = "Use in Crawl" ; -; pageoptions_element.php line: 168 +; pageoptions_element.php line: 207 pageoptions_element_no_compatible_plugins = "No compatible indexing plugins found!" ; -; pageoptions_element.php line: 171 +; pageoptions_element.php line: 210 pageoptions_element_page_rules = "Page Field Extraction Rules" ; -; pageoptions_element.php line: 179 +; pageoptions_element.php line: 218 page_element_search_page = "Search Page Elements and Links" ; -; pageoptions_element.php line: 184 +; pageoptions_element.php line: 223 pageoptions_element_wd_suggest = "Word Suggest" ; -; pageoptions_element.php line: 192 +; pageoptions_element.php line: 231 pageoptions_element_subsearch_link = "Subsearch" ; -; pageoptions_element.php line: 201 +; pageoptions_element.php line: 240 pageoptions_element_signin_link = "Signin" ; -; pageoptions_element.php line: 208 +; pageoptions_element.php line: 247 pageoptions_element_cache_link = "Cache" ; -; pageoptions_element.php line: 216 +; pageoptions_element.php line: 255 pageoptions_element_similar_link = "Similar" ; -; pageoptions_element.php line: 224 +; pageoptions_element.php line: 263 pageoptions_element_in_link = "Inlinks" ; -; pageoptions_element.php line: 230 +; pageoptions_element.php line: 269 pageoptions_element_ip_link = "IP Address" ; -; pageoptions_element.php line: 239 +; pageoptions_element.php line: 278 pageoptions_element_ranking_factors = "Search Ranking Factors" ; -; pageoptions_element.php line: 242 +; pageoptions_element.php line: 281 pageoptions_element_title_weight = "Title Weight:" ; -; pageoptions_element.php line: 247 +; pageoptions_element.php line: 286 pageoptions_element_description_weight = 
"Description Weight:" ; -; pageoptions_element.php line: 252 +; pageoptions_element.php line: 291 pageoptions_element_link_weight = "Link Weight:" ; -; pageoptions_element.php line: 257 +; pageoptions_element.php line: 296 pageoptions_element_results_grouping_options = "Search Results Grouping" ; -; pageoptions_element.php line: 260 +; pageoptions_element.php line: 299 pageoptions_element_min_results_to_group = "Minimum Results to Group:" ; -; pageoptions_element.php line: 265 +; pageoptions_element.php line: 304 pageoptions_element_server_alpha = "Server Alpha:" ; -; pageoptions_element.php line: 273 +; pageoptions_element.php line: 312 pageoptions_element_test_page = "Test Page" ; -; pageoptions_element.php line: 275 +; pageoptions_element.php line: 314 pageoptions_element_page_type = "Type:" ; -; pageoptions_element.php line: 292 +; pageoptions_element.php line: 331 pageoptions_element_save_options = "Save" ; -; pageoptions_element.php line: 294 +; pageoptions_element.php line: 333 pageoptions_element_run_tests = "Test Process Page" ; -; pageoptions_element.php line: 300 +; pageoptions_element.php line: 339 pageoptions_element_test_results = "Test Results" ; -; pageoptions_element.php line: 303 +; pageoptions_element.php line: 342 pageoptions_element_after_process = "After page processor extracts summary" ; -; pageoptions_element.php line: 307 +; pageoptions_element.php line: 346 pageoptions_element_after_rules = "After page rules applied" ; -; pageoptions_element.php line: 311 +; pageoptions_element.php line: 350 pageoptions_element_extracted_words = "Words and positions extracted to index from summary" ; -; pageoptions_element.php line: 315 +; pageoptions_element.php line: 354 pageoptions_element_extracted_metas = "Extracted meta words" ; -; pageoptions_element.php line: 349 +; pageoptions_element.php line: 388 pageoptions_element_run_tests = "Test Process Page" ; -; pageoptions_element.php line: 353 +; pageoptions_element.php line: 392 
pageoptions_element_save_options = "Save" ; ; resultseditor_element.php line: 58 diff --git a/models/crawl_model.php b/models/crawl_model.php index d87d44eec..227234cb3 100755 --- a/models/crawl_model.php +++ b/models/crawl_model.php @@ -36,6 +36,8 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** For base class*/ require_once BASE_DIR."/models/parallel_model.php"; +/** For deleting save points*/ +require_once BASE_DIR."/controllers/search_controller.php"; /** used to prevent cache page requests from being logged*/ if(!defined("POST_PROCESSING") && !defined("NO_LOGGING")) { @@ -347,9 +349,27 @@ class CrawlModel extends ParallelModel implements CrawlConstants $this->db->execute($sql); $sql = "DELETE FROM MIX_COMPONENTS WHERE MIX_TIMESTAMP='$timestamp'"; $this->db->execute($sql); - } + /** + * Deletes the archive iterator and savepoint files created during the + * process of iterating through a crawl mix. + * + * @param int $timestamp The timestamp of the crawl mix + */ + function deleteCrawlMixIteratorState($timestamp) + { + global $INDEXING_PLUGINS; + setLocaleObject(getLocaleTag()); + $searchController = new SearchController($INDEXING_PLUGINS); + $searchController->clearQuerySavepoint($timestamp); + + $archive_dir = WORK_DIRECTORY."/schedules/". 
+ self::name_archive_iterator.$timestamp; + if (file_exists($archive_dir)) { + $this->db->unlinkRecursive($archive_dir); + } + } /** * Returns the initial sites that a new crawl will start with along with @@ -450,6 +470,14 @@ EOT; } $n[] = ""; + $n[] = "[active_classifiers]"; + if(isset($info['active_classifiers'])) { + foreach ($info['active_classifiers']['label'] as $label) { + $n[] = "label[] = '$label';"; + } + } + $n[] = ""; + $site_types = array('allowed_sites' => 'url', 'disallowed_sites' => 'url', 'seed_sites' => 'url', 'page_rules'=>'rule'); @@ -589,7 +617,9 @@ EOT; "disallowed_sites" => array(self::DISALLOWED_SITES, 'url'), "page_rules" => array(self::PAGE_RULES, 'rule'), "indexed_file_types" => array(self::INDEXED_FILE_TYPES, - "extensions") + "extensions"), + "active_classifiers" => array(self::ACTIVE_CLASSIFIERS, + 'label') ); foreach($updatable_site_info as $type => $info) { if(isset($new_info[$type][$info[1]])) { @@ -900,7 +930,7 @@ EOT; $list[] = $crawl; } if($return_arc_bundles) { - $dirs = glob(CRAWL_DIR.'/cache/archives/*', GLOB_ONLYDIR); + $dirs = glob(CRAWL_DIR.'/archives/*', GLOB_ONLYDIR); foreach($dirs as $dir) { $crawl = array(); $crawl['CRAWL_TIME'] = crc32($dir); diff --git a/models/parallel_model.php b/models/parallel_model.php index 143c72834..ebe8ca04f 100755 --- a/models/parallel_model.php +++ b/models/parallel_model.php @@ -443,15 +443,25 @@ class ParallelModel extends Model implements CrawlConstants */ function clearQuerySavePoint($save_timestamp, $machine_urls = NULL) { + /* + It's important to quit early in the case that the timestamp is + empty, as this could result in deleting all SavePoint* files below. + */ + if (!$save_timestamp) return; + if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) { $this->execMachines("clearQuerySavePoint", $machine_urls, $save_timestamp); return; } - $save_files = glob(CRAWL_DIR.'/schedules/'.self::save_point. 
- $save_timestamp."*.txt"); - foreach($save_files as $save_file) { + /* + SavePoint files have a $qpart tagged on to the timestamp to + distinguish between parts of a query, so we want to delete anything + that starts with the appropriate timestamp. + */ + $save_stub = CRAWL_DIR.'/schedules/'.self::save_point.$save_timestamp; + foreach (glob($save_stub.'*.txt') as $save_file) { @unlink($save_file); } } diff --git a/models/phrase_model.php b/models/phrase_model.php index d59aaab1e..7a17ba84d 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -92,7 +92,7 @@ class PhraseModel extends ParallelModel 'filetype:', 'info:', '\-', 'os:', 'server:', 'date:', "numlinks:", 'index:', 'i:', 'ip:', 'weight:', 'w:', 'u:', 'time:', 'code:', 'lang:', 'media:', 'elink:', 'location:', 'size:', 'host:', 'dns:', - 'path:', 'robot:', 'safe:', 'guid:'); + 'path:', 'robot:', 'safe:', 'guid:', 'class:', 'class-score:'); /** * Number of pages to cache in one go in memcache or filecache @@ -401,7 +401,7 @@ class PhraseModel extends ParallelModel } if(isset($total_rows)) { $results['TOTAL_ROWS'] = $total_rows; - } else { + } else if (isset($results['PAGES'])) { $results['TOTAL_ROWS'] = count($results['PAGES']); } diff --git a/models/profile_model.php b/models/profile_model.php index 3e953e8ad..b9cfb4ab2 100644 --- a/models/profile_model.php +++ b/models/profile_model.php @@ -80,10 +80,11 @@ class ProfileModel extends Model { $to_make_dirs = array($directory, "$directory/app", - "$directory/cache", "$directory/data", "$directory/feeds", - "$directory/locale", "$directory/log", - "$directory/prepare", "$directory/schedules", - "$directory/search_filters", "$directory/temp"); + "$directory/archives", "$directory/cache", + "$directory/classifiers", "$directory/data", "$directory/feeds", + "$directory/locale", "$directory/log", "$directory/prepare", + "$directory/schedules", "$directory/search_filters", + "$directory/temp"); $dir_status = array(); foreach($to_make_dirs as $dir) { 
$dir_status[$dir] = $this->createIfNecessaryDirectory($dir); diff --git a/scripts/classifiers.js b/scripts/classifiers.js new file mode 100644 index 000000000..8ed1b9131 --- /dev/null +++ b/scripts/classifiers.js @@ -0,0 +1,840 @@ +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage javascript + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009 - 2013 + * @filesource + */ + +/** + * Implements the client interface for finding and labeling documents. + * + * Classifier behaves like a static class with some private variables and + * functions. The setup work is all done in the initialize method, and after + * that all work is done in response to timeouts or user actions, such as + * button clicks. + * + * @author Shawn Tice + * @package seek_quarry + * @subpackage javascript + */ +var Classifier = (function() { + /** + * Maximum size of the document candidate pool. This constant is used to + * decide when to display, e.g., 50+ instead of just 50.
+ * @var int + */ + var MAX_UNLABELLED_BUFFER_SIZE = 51; + + /** + * The maximum number of previously-labeled document records to display. + * @var int + */ + var MAX_LABELLED = 20; + + /** + * How long to wait before adding another '.' to the end of a loading + * message. The advantage of choosing 333 is that the time to display three + * periods is roughly one second. + * @var int + */ + var LOADING_REDRAW = 333; + + // We return this at the bottom, so this is Classifier's public interface. + var self = {}; + + /** + * Gathers references to all relevant DOM elements, initializes state, and + * adds event handlers. Because AJAX requests to the administrative areas + * of Yioop must be authenticated, this method expects to be called with + * its first authentication token; each request will then yield a new token + * good for one more request. + * + * @param string classLabel label for the classifier being trained + * @param string authSession authentication token good for one request + * @param int authTime timestamp associated with the auth token + */ + self.initialize = function(classLabel, authSession, authTime) + { + self.classLabel = classLabel; + self.authTime = authTime; + self.authSession = authSession; + + self.elt = { + 'positive_count': elt('positive-count'), + 'negative_count': elt('negative-count'), + 'accuracy': elt('accuracy'), + 'update_accuracy': elt('update-accuracy'), + 'label_docs_form': elt('label-docs-form'), + 'label_docs_source': elt('label-docs-source'), + 'label_docs_type': elt('label-docs-type'), + 'label_docs_keywords': elt('label-docs-keywords'), + 'label_docs_status': elt('label-docs-status'), + 'label_docs_queue': null, + }; + + self.docCounter = 1; + self.documents = {}; + self.activeDocument = null; + self.labelledDocQueue = []; + self.lastSource = null; + self.lastSourceType = null; + self.lastKeywords = null; + self.lastStatus = ''; + self.loadingTimer = null; + + self.elt.label_docs_form.onsubmit = function() { + 
self.requestDocuments(); + return false; + } + + self.elt.update_accuracy.onclick = function() { + if (!hasClass('disabled', self.elt.update_accuracy)) { + self.requestAccuracyUpdate(); + } + return false; + } + }; + + /** + * Event handler called when the user clicks on any of the "In class", "Not + * in class", and "Skip" links associated with a document. This method + * updates the display and sends a request to the server to inform it of + * the user's decision and get the next document to be labeled. + * + * @param int docid key for the associated document + * @param string action 'inclass', 'notinclass', or 'skip' + */ + self.handleAction = function(docid, action) + { + var doc = self.documents[docid]; + var label; + switch (action) + { + case 'inclass': + label = 1; + break; + case 'notinclass': + label = -1; + break; + case 'skip': + label = 0; + break; + } + + // Only send a request if something has changed. + if (doc.label === undefined || doc.label != label) { + self.sendNewLabel(doc, label); + } + + // Update the class for benefit of the CSS + doc.element.className = 'labelled ' + action; + doc.label = label; + + /* + If the labelled (or skipped) document was the active document, then + push it down on the labeled queue, shifting off the oldest document + if the queue is full. + */ + if (doc == self.activeDocument) { + self.activeDocument = null; + if (self.labelledDocQueue.length == MAX_LABELLED) { + var droppedDoc = self.labelledDocQueue.shift(); + droppedDoc.element.parentNode.removeChild(droppedDoc.element); + } + self.labelledDocQueue.push(doc); + } + + return false; + } + + /* PRIVATE INTERFACE */ + + /** + * Sends a request to load up a new candidate pool based on the selected + * index, index action, and optional query. The response behavior differs + * according to whether the index action specifies marking all candidates + * as positive or negative examples, or manual labeling. 
In the latter case + * the number of candidate documents (up to MAX_UNLABELLED_BUFFER_SIZE) is + * displayed, while in the former case the number of documents added to the + * pool is displayed. + */ + self.requestDocuments = function() + { + self.lastSource = self.elt.label_docs_source.value; + self.lastSourceType = self.elt.label_docs_type.value; + self.lastKeywords = self.elt.label_docs_keywords.value; + + var loading = loadingText(self.elt.label_docs_status, + tl['editclassifier_loading']); + + sendRequest({ + 'url': '?c=classifier&a=classify&arg=getdocs', + 'postdata': { + 'session': self.authSession, + 'time': self.authTime, + 'label': self.classLabel, + 'index': self.lastSource, + 'type': self.lastSourceType, + 'keywords': self.lastKeywords + }, + 'onSuccess': function(response) { + loading.clear(); + self.authSession = response.authSession; + self.authTime = response.authTime; + self.clearActiveDocument(); + if (response.new_doc) { + self.setActiveDocument(response.new_doc); + } + if (response.add_count) { + // Only present when mass-labeling. + msg = format(tl['editclassifier_added_examples'], + response.add_count, self.lastSourceType); + self.setStatus(msg); + self.drawStatistics(response); + } else { + self.drawDocumentCount(response.num_docs); + } + }, + 'onFailure': function() { + loading.clear(); + self.setStatus(tl['editclassifier_load_failed']); + } + }); + } + + /* + * Encodes any labels stored in the labels var as POST data, and sends a + * request to add these labels (using the document url as a key) to + * the classifier controller on the server. This method is called by the + * handleAction method in order to actually send the new label (or skip) to + * the server. 
+ * + * @param object doc document to send a label for + * @param int label user-assigned label + */ + self.sendNewLabel = function(doc, label) + { + var loading = loadingText(self.elt.label_docs_status, + tl['editclassifier_loading']); + sendRequest({ + 'url': '?c=classifier&a=classify&arg=addlabel', + 'postdata': { + 'session': self.authSession, + 'time': self.authTime, + 'label': self.classLabel, + 'index': self.lastSource, + 'type': self.lastSourceType, + 'keywords': self.lastKeywords, + 'doc_to_label': { + 'docid': doc.id, + 'key': doc.key, + 'label': label + } + }, + 'onSuccess': function(response) { + loading.clear(); + self.authSession = response.authSession; + self.authTime = response.authTime; + if (response.new_doc) { + /* + There may still be an active document in the case that + we were re-labelling an old document, but now we want to + replace it. + */ + self.clearActiveDocument(); + self.setActiveDocument(response.new_doc); + } + self.drawStatistics(response); + self.drawDocumentCount(response.num_docs); + }, + 'onFailure': function() { + loading.clear(); + self.setStatus(tl['editclassifier_label_update_failed']); + } + }); + + } + + /** + * Sends a request to the server to initiate an accuracy update, and on + * response updates the statistics (which includes reporting the current + * accuracy estimate, if any). Normally, the accuracy is only estimated + * each time a set number of documents have been added to the training set. + * The update accuracy functionality lets the user request an update + * without having to actually add more documents. 
+ */ + self.requestAccuracyUpdate = function() + { + var updating = tl['editclassifier_updating']; + var loading = loadingText(self.elt.update_accuracy, updating, { + 'dots': false, + 'className': 'disabled' + }); + sendRequest({ + 'url': '?c=classifier&a=classify&arg=updateaccuracy', + 'postdata': { + 'session': self.authSession, + 'time': self.authTime, + 'label': self.classLabel, + 'index': self.lastSource, + 'type': self.lastSourceType, + 'keywords': self.lastKeywords, + }, + 'onSuccess': function(response) { + self.authSession = response.authSession; + self.authTime = response.authTime; + self.drawStatistics(response); + loading.clear(); + }, + 'onFailure': function() { + loading.clear(); + self.setStatus(tl['editclassifier_acc_update_failed']); + } + }); + } + + /** Builds and displays a new active document record for the document data + * received from the server. This method both registers the document data + * in internal data structures, and creates the DOM structure to display + * the document to the user. If this is the very first document to be + * labeled since page load, then the table that holds documents is created + * before the new document is inserted into the DOM. + * + * @param object doc data structure representing the new active document + */ + self.setActiveDocument = function(doc) { + doc.id = self.docCounter++; + self.documents[doc.id] = doc; + self.activeDocument = doc; + + // Create table if it doesn't yet exist. 
+ if (!self.elt.label_docs_queue) { + var queue = document.createElement('table'); + queue.id = 'label-docs-queue'; + self.elt.label_docs_form.parentNode.insertBefore( + queue, self.elt.label_docs_form.nextElementSibling); + self.elt.label_docs_queue = queue; + } + + var newRow = self.buildDocumentRow(doc); + doc.element = newRow; + + var topDoc = self.elt.label_docs_queue.firstChild; + if (topDoc) { + self.elt.label_docs_queue.insertBefore(newRow, topDoc); + } else { + self.elt.label_docs_queue.appendChild(newRow); + } + } + + /** + * Removes the active document from the DOM and from the internal set of + * documents completely. This is done when abandoning the current candidate + * pool for another, and is NOT the same as skipping the active document. + */ + self.clearActiveDocument = function() + { + if (self.activeDocument) { + var topDoc = self.activeDocument.element; + self.elt.label_docs_queue.removeChild(topDoc); + delete self.documents[self.activeDocument.id]; + } + self.activeDocument = null; + } + + /** + * Updates the display of the counts of positive and negative examples and + * the estimated accuracy. Each time the server responds to a request, it + * passes along the classifier's current counts and accuracy estimate to + * keep the client presentation of these statistics in sync. + * + * @param object response data from the last server request + */ + self.drawStatistics = function(response) + { + self.elt.positive_count.innerHTML = response.positive; + self.elt.negative_count.innerHTML = response.negative; + if (response.accuracy === null) { + self.elt.accuracy.innerHTML = tl['editclassifier_na']; + } else { + self.elt.accuracy.innerHTML = format('{1}%', + (response.accuracy * 100).toFixed(1)); + } + } + + /** + * Updates the display of the number of documents currently in the + * candidate pool. 
Since candidates are being iterated over on the server + * rather than loaded in all at once, it is unknown exactly how many there + * are until the pool has been exhausted. To reflect this situation, when + * there are more candidates than will fit in the current pool, a plus sign + * is appended to the current count. + * + * @param int num_docs number of documents in the server's candidate pool + */ + self.drawDocumentCount = function(num_docs) + { + var msg; + if (!num_docs) { + msg = tl['editclassifier_no_docs']; + } else { + var count, plus; + if (num_docs == MAX_UNLABELLED_BUFFER_SIZE) { + count = MAX_UNLABELLED_BUFFER_SIZE - 1; + plus = '+'; + } else { + count = num_docs; + plus = ''; + } + msg = format(tl['editclassifier_num_docs'], count, plus); + } + self.setStatus(msg); + } + + /** + * A shortcut for setting the HTML of the element that displays document + * counts. + */ + self.setStatus = function(msg) + { + self.elt.label_docs_status.innerHTML = msg; + } + + /** + * Builds the DOM element representing a document. Each document is + * represented by a row in a table, where the row has two cells, the first + * dedicated to action links (e.g., for marking a document as a positive + * example) and the second to summarizing the document. 
+ * + * @param object doc data structure representing the new document + * @return object table row DOM element representing the document + */ + self.buildDocumentRow = function(doc) + { + var tr = document.createElement('tr'); + tr.id = 'doc-' + doc.id; + tr.innerHTML = + tags('td', {'class': 'actions'}, + self.buildActionLinkHTML(tl['editclassifier_in_class'], + 'inclass', doc), + self.buildActionLinkHTML(tl['editclassifier_not_in_class'], + 'notinclass', doc), + self.buildActionLinkHTML(tl['editclassifier_skip'], + 'skip', doc) + ) + + tags('td', {'class': 'info'}, + tags('p', {'class': 'page-link'}, + tags('a', {'href': doc.cache_link}, doc.title)), + tags('p', {'class': 'echo-link'}, doc.url), + tags('p', {'class': 'prediction'}, + self.buildPredictionHTML(doc)), + doc.description && doc.description.length > 0 ? + tags('p', {'class': 'description'}, doc.description) : + '' + ); + return tr; + } + + /** + * Builds an anchor element used to allow a user to mark a document as a + * positive or negative example, or to skip it. The anchor has an onclick + * attribute that calls the handleAction method with the document id and + * action. + * + * @param string label anchor text displayed to the user + * @param string action action associated with this anchor + * @param object doc data structure representing the document the action + * should be applied to + * @return object paragraph DOM element wrapping the created anchor + */ + self.buildActionLinkHTML = function(label, action, doc) + { + var onclick = 'return Classifier.handleAction(' + doc.id + + ",'" + action + "')"; + var link = tags('a', { + 'class': action, + 'href': '#' + action, + 'onclick': onclick + }, label); + return tags('p', {}, '[', link, ']'); + } + + /** + * Builds an HTML string that displays the classification confidence and + * disagreement score associated with a document, using data sent from the + * server. 
+ * + * @param object doc data structure representing the document + * @return string HTML string to be used to display confidence and + * disagreement + */ + self.buildPredictionHTML = function(doc) + { + label = (doc.positive ? '' : 'not ') + self.classLabel; + var prediction = format(tl['editclassifier_prediction'], label); + var scores = format(tl['editclassifier_scores'], + (doc.confidence * 100).toFixed(1), + (doc.disagreement * 100).toFixed(1)); + return format('<b>{1}</b> ({2})', prediction, scores); + } + + /* UTILITY FUNCTIONS */ + + /** + * Builds a string containing a pair of HTML tags with optional attributes + * and nested elements. All arguments but the tag name are optional, but if + * nested elements are to be supplied, then attributes for the opening tag + * must be supplied as well, even if they're empty. Attributes are + * specified as an object where the keys are attribute names and their + * values are strings. Each nested element may be either an HTML string or + * an array of HTML strings, all of which will be concatenated together. + * This function creates ONLY closed HTML tags (e.g., <td>...</td>, and not + * <img.../>); the tag function should be used to create self-closing HTML + * tags. + * + * @param string tagname opening and closing tag name + * @param object attributes optional object for which the keys are + * attribute names, and the values are attribute values (may be empty) + * @param string|array nested... optional sequence of HTML strings or + * arrays of HTML strings to be nested within the opening and closing tags + * @return string HTML string for the described element + */ + function tags(tagname, attributes /* ... 
*/) + { + var element = [makeOpenTag(tagname, attributes, '>')]; + for (var i = 2; i < arguments.length; i++) { + var type = typeof(arguments[i]); + switch (type) + { + case 'object': + element = element.concat(arguments[i]); + break; + case 'string': + if (arguments[i].length > 0) + element.push(arguments[i]); + break; + } + } + element.push('</' + tagname + '>'); + return element.join(''); + } + + /** + * This function is just like the tags function, but creates a self-closing + * tag (e.g., <img.../>), which by necessity cannot contain nested + * elements. + * + * @param string tagname opening tag name + * @param object attributes optional object for which the keys are + * attribute names, and the values are attribute values (may be empty) + * @return string HTML string for the described element + */ + function tag(tagname, attributes) + { + return makeOpenTag(tagname, attributes, ' />'); + } + + /** + * A utility function to construct the opening tag of an HTML element, or a + * self-closing tag, along with optional attributes. + * + * @param string tagname opening tag name + * @param object attributes optional object for which the keys are + * attribute names, and the values are attribute values (may be empty) + * @return string HTML string for the opening (or self-closing) tag + */ + function makeOpenTag(tagname, attributes, endtag) + { + var tag = ['<' + tagname]; + if (attributes) { + for (key in attributes) { + tag.push(' ' + key + '=' + '"' + attributes[key] + '"'); + } + } + tag.push(endtag); + return tag.join(''); + } + + /** + * A simple string formatter that substitutes string arguments into a + * template string. The template string should contain substrings with the + * pattern '{\d+}' (e.g., {1}, {2}, ...), which will be replaced with the + * corresponding arguments passed to the format function. For example, any + * occurrence of '{1}' will be replaced by the first argument after the + * template string. 
+ * + * @param string template template string that optionally contains sentinel + * sequences of the form '{\d+}' to be replaced + * @param string arg... positional arguments to be substituted into the + * template string + * @return string the template string with each sentinel pattern replaced + * by the appropriate argument + */ + function format(template /* ... */) + { + var args = arguments; + return template.replace(/\{(\d+)\}/g, function(match, i) { + var arg = args[parseInt(i)]; + return typeof arg == 'object' ? JSON.stringify(arg) : arg; + }); + } + + /** + * Builds an XmlHttpRequest with optional POST data to be sent to the + * server, and calls the appropriate continuation function when the request + * completes or fails. The request is carried out asynchronously, and the + * response handlers are defined by the onSuccess and onFailure keys of the + * options object passed into this function. If the response content-type + * is set to application/json, then the response is JSON-decoded before + * being passed to the onSuccess handler. 
The options object supports the + * following keys: + * + * string url: URL to send the request to (required) + * + * string method: HTTP method to use (default GET, but changes to POST + * if postdata is specified without also setting the method) + * + * object postdata: object containing key/value pairs of POST arguments + * to be sent with the request; the values are automatically + * URI-encoded (optional) + * + * function onSuccess: function to be called upon the completion of a + * successful request; the response body is passed as the first and + * only argument, JSON-decoded if the response content-type was + * application/json (optional) + * + * function onFailure: function called if the request times out or + * otherwise can't be completed (optional) + * + * Example: + * + * sendRequest({ + * 'url': '?c=classifier&a=classify&arg=getdocs', + * 'postdata': { + * 'time': self.authTime, + * 'session': self.authSession, + * 'label': self.classLabel, + * 'mix': label_docs_source.value, + * 'keywords': label_docs_keywords.value + * }, + * 'onSuccess': function(response) { + * ... + * }, + * 'onFailure': function() { + * ... + * } + * }); + * + * @param object options request options.
+ */ + function sendRequest(options) + { + if (!options.url) { + throw "sendRequest: 'url' option is required" + } + + var method = options.method || 'GET'; + var onSuccess = options.onSuccess || function() {}; + var onFailure = options.onFailure || function() {}; + + var request = makeRequest(); + if (!request) { + onFailure(); + return false; + } + + request.onreadystatechange = function() { + if (request.readyState == 4 && request.status == 200) { + var response = request.responseText; + var type = request.getResponseHeader('content-type'); + if (type.match(/application\/json/)) { + response = JSON.parse(response); + } + onSuccess(response); + } + } + + if (options.postdata) { + var postdata = buildQueryString(options.postdata); + if (!options.method) { + method = 'POST'; + } + } + + request.open(method, options.url, true); + + if (postdata) { + request.setRequestHeader("Content-type", + "application/x-www-form-urlencoded"); + request.send(postdata); + } else { + request.send(); + } + } + + /** + * Recursively builds a query string from an object, URI-encoding any + * strings. Nested objects are handled using the standard HTTP notation for + * nested arrays; for example, the element accessed in object notation by + * a.b.c would be converted to a[b][c] in the query string. + * + * @param object obj optionally-nested object to be converted to a query + * string + * @param string prefix optional prefix to prepend to keys in obj (used in + * recursive calls) + * @return string query string representation of obj + */ + function buildQueryString(obj, prefix) + { + var str = []; + for (var p in obj) { + p = encodeURIComponent(p); + var k = prefix ? prefix + "[" + p + "]" : p; + v = obj[p]; + str.push(typeof v == "object" ? + buildQueryString(v, k) : + encodeURIComponent(k) + "=" + encodeURIComponent(v)); + } + return str.join("&"); + } + + /** + * Removes a particular class from the passed-in element if it's present; + * otherwise does nothing. 
+ * + * @param string className class name to remove + * @param object el DOM object to modify + */ + function removeClass(className, el) + { + var re = RegExp('(^| )'+className+'( |$)'); + el.className = el.className.replace(re, '$1'); + } + + /** + * Adds a particular class to the passed-in element; if the element already + * has the class then it is removed and then re-added, which should have no + * significant effect. + * + * @param string className class name to add + * @param object el DOM object to modify + */ + function addClass(className, el) + { + removeClass(className, el); + el.className += ' ' + className; + } + + /** + * Returns true if the passed-in element has a particular class, and false + * otherwise. + * + * @param string className the class to check for + * @param object el DOM object to query + * @return bool true if el has class className, and false otherwise + */ + function hasClass(className, el) + { + var re = RegExp('(^| )'+className+'( |$)'); + return el.className.search(re) != -1; + } + + /** + * Places an element into a loading state, optionally adding a class and + * setting some text, and provides a method to call in order to cancel the + * loading state. The basic use case is to replace some text element with + * 'Loading...' text at the beginning of an asynchronous request, then to + * revert back to the pre-loading state once the request completes. This + * function returns an object with a clear method, which may be called in + * order to cancel the loading state.
The options object may contain the + * following fields: + * + * bool dots: whether to automatically append dots to the loading text + * with the passage of a set time interval; the dots start over + * each time they reach three (default true) + * + * int dotsInterval: how long to wait before drawing the next dot + * (default 333ms) + * + * string className: class name to add to the element when loading + * starts, and to remove when it completes (default none) + * + * Example: + * + * var loading = loadingText(el, 'Loading'); + * someAsynchronousAction({ + * onComplete: function() { + * loading.clear(); + * ... + * } + * }); + * + * @param object el DOM object to be manipulated + * @param string text loading text with which to replace el's innerHTML + * @param object options loading options + * @return object object with a clear method, which can be called in order + * to cancel the loading state, restoring everything to the way it was + * before loading started + */ + function loadingText(el, text, options) + { + if (options == undefined) { + options = {}; + } + var oldHTML = el.innerHTML; + var drawDots = options.dots !== false; + var interval = options.dotsInterval || 333; + var timer; + if (drawDots) { + timer = window.setInterval(function() { + if (el.innerHTML.match(/\.{3}$/)) { + el.innerHTML = text; + } else { + el.innerHTML += '.'; + } + }, interval); + } + if (options.className) { + addClass(options.className, el); + } + el.innerHTML = text; + return obj = { + 'clear': function() { + if (drawDots) { + window.clearInterval(timer); + } + el.innerHTML = oldHTML; + if (options.className) { + removeClass(options.className, el); + } + } + }; + } + + return self; +})(); diff --git a/views/admin_view.php b/views/admin_view.php index 5333b156c..e9f3687c0 100755 --- a/views/admin_view.php +++ b/views/admin_view.php @@ -54,8 +54,9 @@ class AdminView extends View var $elements = array("language", "activity", "signin", "managecrawls", "manageaccount", "manageusers", 
"manageroles", "mixcrawls", "managelocales", "editlocales", "crawloptions", - "editmix", "pageoptions", "resultseditor", "searchsources", - "managemachines", "machinelog", "editstatic", "configure"); + "editmix", "pageoptions", "manageclassifiers", "editclassifier", + "resultseditor", "searchsources", "managemachines", "machinelog", + "editstatic", "configure"); /** Names of helper objects that the view uses to help draw itself * @var array */ diff --git a/views/elements/editclassifier_element.php b/views/elements/editclassifier_element.php new file mode 100644 index 000000000..431e2cedf --- /dev/null +++ b/views/elements/editclassifier_element.php @@ -0,0 +1,149 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage element + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009 - 2013 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + * This element renders the initial edit page for a classifier, where the user + * can update the classifier label and find documents to label and add to the + * training set. The page displays some initial statistics and a form for + * finding documents in any existing index, but after that it is heavily + * modified by JavaScript in response to user actions and XmlHttpRequests + * made to the server. + * + * @author Shawn Tice + * @package seek_quarry + * @subpackage element + */ +class EditclassifierElement extends Element +{ + /** + * Draws the "edit classifier" element to the output buffers. + * + * @param array $data used to pass the class label, classifier instance, + * and list of existing crawls + */ + function render($data) + { + $classifier = $data['classifier']; + ?> + <div class="current-activity"> + <div class="<?php e($data['leftorright']);?>"> + <a href="?c=admin&a=manageClassifiers&<?php + e(CSRF_TOKEN.'='.$data[CSRF_TOKEN]) ?>"><?php + e(tl('editclassifier_back')) ?></a> + </div> + <h2><?php e(tl('editclassifier_edit_classifier')) ?></h2> + <form id="classifierForm" method="get" action=""> + <input type="hidden" name="c" value="admin" /> + <input type="hidden" name="<?php e(CSRF_TOKEN); ?>" value="<?php + e($data[CSRF_TOKEN]); ?>" /> + <input type="hidden" name="a" value="manageClassifiers" /> + <input type="hidden" name="arg" value="editclassifier" /> + <input type="hidden" name="update" value="update" /> + <input type="hidden" name="class_label" + value="<?php e($data['class_label']) ?>" /> + <div class="top-margin"> + <label for="rename-label"><?php + e(tl('editclassifier_classifier_label')) ?></label> + <input type="text" 
id="rename-label" name="rename_label" + value="<?php e($data['class_label']) ?>" + maxlength="80" class="wide-field"/> + <button class="button-box" type="submit"><?php + e(tl('editclassifier_change')); ?></button> + </div> + </form> + <h3><?php e(tl('editclassifier_statistics')) ?></h3> + <p><b><?php e(tl('editclassifier_positive_examples')) + ?></b> <span id="positive-count"><?php + e($classifier->positive) ?></span></p> + <p><b><?php e(tl('editclassifier_negative_examples')) + ?></b> <span id="negative-count"><?php + e($classifier->negative) ?></span></p> + <p><b><?php e(tl('editclassifier_accuracy')) + ?></b> <span id="accuracy"><?php + if (!is_null($classifier->accuracy)) { + printf('%.1f%%', $classifier->accuracy * 100); + } else { + e(tl('editclassifier_na')); + }?></span> + [<a id="update-accuracy" href="#update-accuracy" + <?php if ($classifier->total < 10) { + e('class="disabled"'); + } ?>><?php e(tl('editclassifier_update')) ?></a>]</p> + <h3><?php e(tl('editclassifier_add_examples')) ?></h3> + <form id="label-docs-form" action="" method="GET"> + <table> + <tr> + <th><?php e(tl('editclassifier_source')) ?></th> + <td> + <select id="label-docs-source" name="label_docs_source"> + <option value="1" selected="selected"><?php + e(tl('editclassifier_default_crawl')) ?></option> + <?php foreach ($data['CRAWLS'] as $crawl) { ?> + <option value="<?php e($crawl['CRAWL_TIME']) ?>"><?php + e($crawl['DESCRIPTION']) ?></option> + <?php } ?> + </select> + </td> + <td> + <select id="label-docs-type" name="label_docs_type"> + <option value="manual" selected="selected"><?php + e(tl('editclassifier_label_by_hand')) ?></option> + <option value="positive"><?php + e(tl('editclassifier_all_in_class')) ?></option> + <option value="negative"><?php + e(tl('editclassifier_none_in_class')) ?></option> + </select> + </td> + </tr> + <tr> + <th><?php e(tl('editclassifier_keywords')) ?></th> + <td colspan="2"> + <input type="text" maxlength="80" id="label-docs-keywords" + 
name="label_docs_keywords" /> + <button class="button-box" type="submit"><?php + e(tl('editclassifier_load')) ?></button> + </td> + </tr> + <tr> + <th> </th> + <td id="label-docs-status" colspan="2"><?php + e(tl('editclassifier_no_documents')) ?></td> + </tr> + </table> + </form> + <?php + } +} +?> diff --git a/views/elements/manageclassifiers_element.php b/views/elements/manageclassifiers_element.php new file mode 100644 index 000000000..1c6575fde --- /dev/null +++ b/views/elements/manageclassifiers_element.php @@ -0,0 +1,125 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage element + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009 - 2013 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + * This element renders the page that lists classifiers, provides a form to + * create new ones, and provides per-classifier action links to edit, finalize, + * and delete the associated classifier. 
+ * + * @author Shawn Tice + * @package seek_quarry + * @subpackage element + */ +class ManageclassifiersElement extends Element +{ + /** + * Draws the "new classifier" form and table of existing classifiers + * + * @param array $data used to pass the list of existing classifier + * instances + */ + function render($data) + { + $base_url = "?c=admin&a=manageClassifiers&".CSRF_TOKEN."=". + $data[CSRF_TOKEN]."&arg="; + ?> + <div class="current-activity"> + <h2><?php e(tl('manageclassifiers_manage_classifiers')) ?></h2> + <form id="classifiersForm" method="get" action=''> + <input type="hidden" name="c" value="admin" /> + <input type="hidden" name="<?php e(CSRF_TOKEN); ?>" value="<?php + e($data[CSRF_TOKEN]); ?>" /> + <input type="hidden" name="a" value="manageClassifiers" /> + <input type="hidden" name="arg" value="createclassifier" /> + <div class="top-margin"><label for="class-label"><?php + e(tl('manageclassifiers_classifier_name')) ?></label> + <input type="text" id="class-label" name="class_label" + value="" maxlength="80" + class="wide-field"/> + <button class="button-box" type="submit"><?php + e(tl('manageclassifiers_create_button')) ?></button> + </div> + </form> + <?php if (!empty($data['classifiers'])) { ?> + <h3><?php e(tl('manageclassifiers_available_classifiers')) ?></h3> + <table class="classifiers-table"> + <tr> + <th><?php e(tl('manageclassifiers_label_col')) ?></th> + <th><?php e(tl('manageclassifiers_positive_col')) ?></th> + <th><?php e(tl('manageclassifiers_negative_col')) ?></th> + <th colspan="3"><?php + e(tl('manageclassifiers_actions_col')) ?></th> + </tr> + <?php foreach ($data['classifiers'] as $label => $classifier) { ?> + <tr> + <td><b><?php e($label) ?></b><br /> + <small><?php e(date("d M Y H:i:s", + $classifier->timestamp)) ?></small> + </td> + <td><?php e($classifier->positive) ?></td> + <td><?php e($classifier->negative) ?></td> + <td><a href="<?php e($base_url) + ?>editclassifier&class_label=<?php + e($label) ?>"><?php +
e(tl('manageclassifiers_edit')) ?></a></td> + <td><?php + if ($classifier->finalized == Classifier::FINALIZED) { + e(tl('manageclassifiers_finalized')); + } else if ($classifier->finalized == Classifier::UNFINALIZED) { + if ($classifier->total > 0) { + ?><a href="<?php e($base_url) + ?>finalizeclassifier&class_label=<?php + e($label) ?>"><?php + e(tl('manageclassifiers_finalize')) ?></a><?php + } else { + e(tl('manageclassifiers_finalize')); + } + } else if ($classifier->finalized == Classifier::FINALIZING) { + e(tl('manageclassifiers_finalizing')); + } + ?></td> + <td><a href="<?php e($base_url) + ?>deleteclassifier&class_label=<?php + e($label) ?>"><?php + e(tl('manageclassifiers_delete')) ?></a></td> + </tr> + <?php } // end foreach over classifiers ?> + </table> + <?php } // endif for available classifiers ?> + </div> + <?php + } +} +?> diff --git a/views/elements/pageoptions_element.php b/views/elements/pageoptions_element.php index 1288a7ce2..ae5101eaf 100644 --- a/views/elements/pageoptions_element.php +++ b/views/elements/pageoptions_element.php @@ -122,10 +122,12 @@ class PageOptionsElement extends Element ?><td><table class="file-types-table" ><?php } ?> - <tr><td><label for="<?php e($filetype); ?>-id"><?php + <tr><td><label for="filetype-<?php e($filetype); ?>-id"><?php e($filetype); ?> </label></td><td><input type="checkbox" <?php e($checked) ?> - name="filetype[<?php e($filetype); ?>]" value="true" /></td> + name="filetype[<?php e($filetype); ?>]" + id="filetype-<?php e($filetype); ?>-id" + value="true" /></td> </tr> <?php $cnt++; @@ -139,9 +141,47 @@ class PageOptionsElement extends Element } ?> </tr></table> - <div class="top-margin"><b><?php - e(tl("pageoptions_element_indexing_plugins"));?></b></div> - <?php if(isset($data['INDEXING_PLUGINS']) && + <div class="top-margin"><b><?php + e(tl('pageoptions_element_classifiers_to_apply')) ?></b> + </div> + <?php if (!empty($data['CLASSIFIERS'])) { ?> + <table class="classifiers-all"><tr> + <?php $cnt = 0; 
+ $num_per_column = count($data['CLASSIFIERS']); + if ($num_per_column > 5) { + $num_per_column = ceil($num_per_column / 3); + } + foreach ($data['CLASSIFIERS'] as $label => $checked) { + if ($cnt % $num_per_column == 0) { + ?><td><table class="classifiers-table" ><?php + } + ?> + <tr><td><label for="classifier-<?php e($label); ?>-id"><?php + e($label); ?> + </label></td><td><input type="checkbox" <?php e($checked) ?> + name="classifier[<?php e($label); ?>]" + id="classifier-<?php e($label) ?>-id" value="true" /></td> + </tr> + <?php + $cnt++; + if($cnt % $num_per_column == 0) { + ?></table></td><?php + } + }?> + <?php + if($cnt % $num_per_column != 0) { + ?></table></td><?php + } + ?> + </tr></table> + <?php + } else { + e("<p class='red'>". + tl('pageoptions_element_no_classifiers').'</p>'); + } ?> + <div class="top-margin"><b><?php + e(tl("pageoptions_element_indexing_plugins"));?></b></div> + <?php if(isset($data['INDEXING_PLUGINS']) && count($data['INDEXING_PLUGINS']) > 0) { ?> <table class="indexing-plugin-table"> <tr><th><?php e(tl('pageoptions_element_plugin'));