Add Classification to Yioop a=shawn

Shawn Tice [2013-05-09 18:May:th]
Add Classification to Yioop a=shawn

This commit adds a new set of library files, major modifications to the admin
controller, a new classifiers controller, two new bin tools for training a
classifier via the web interface and testing classifiers, a new activity tab,
and localizations.

Signed-off-by: Chris Pollett <chris@pollett.org>
Filename
bin/arc_tool.php
bin/classifier_tool.php
bin/classifier_trainer.php
bin/fetcher.php
configs/createdb.php
controllers/admin_controller.php
controllers/classifier_controller.php
controllers/fetch_controller.php
controllers/resource_controller.php
css/search.css
data/default.db
index.php
lib/archive_bundle_iterators/mix_archive_bundle_iterator.php
lib/classifiers/classifier.php
lib/classifiers/classifier_algorithm.php
lib/classifiers/feature_selection.php
lib/classifiers/features.php
lib/classifiers/lasso_regression.php
lib/classifiers/naive_bayes.php
lib/crawl_constants.php
lib/upgrade_functions.php
locale/en-US/configure.ini
models/crawl_model.php
models/parallel_model.php
models/phrase_model.php
models/profile_model.php
scripts/classifiers.js
views/admin_view.php
views/elements/editclassifier_element.php
views/elements/manageclassifiers_element.php
views/elements/pageoptions_element.php
diff --git a/bin/arc_tool.php b/bin/arc_tool.php
index 6a7cbaab8..30ac97392 100755
--- a/bin/arc_tool.php
+++ b/bin/arc_tool.php
@@ -149,7 +149,7 @@ class ArcTool implements CrawlConstants
             if($path == $argv[2] && !file_exists($path)) {
                 $path = CRAWL_DIR."/cache/".$path;
                 if(!file_exists($path)) {
-                    $path = CRAWL_DIR."/cache/archives/".$argv[2];
+                    $path = CRAWL_DIR."/archives/".$argv[2];
                 }
             }
         }
@@ -223,7 +223,7 @@ class ArcTool implements CrawlConstants
             }
         }

-        $nonyioop_pattern = CRAWL_DIR."/cache/archives/*/arc_description.ini";
+        $nonyioop_pattern = CRAWL_DIR."/archives/*/arc_description.ini";
         $archives = glob($nonyioop_pattern);
         if(is_array($archives) && count($archives) > 0 ) {
             $archives_found = true;
@@ -481,7 +481,7 @@ class ArcTool implements CrawlConstants
      */
     function getArchiveName($archive_path)
     {
-        $start = CRAWL_DIR."/cache/archives/";
+        $start = CRAWL_DIR."/archives/";
         if(strstr($archive_path, $start)) {
             $start_len = strlen($start);
             $name = substr($archive_path, $start_len);
@@ -846,7 +846,7 @@ php arc_tool.php info bundle_name

 php arc_tool.php list
     /* returns a list of all the archives in the Yioop! crawl directory,
-       including non-Yioop! archives in the cache/archives sub-folder.*/
+       including non-Yioop! archives in the /archives sub-folder.*/

 php arc_tool.php mergetiers bundle_name max_tier
     // merges tiers of word dictionary into one tier up to max_tier
diff --git a/bin/classifier_tool.php b/bin/classifier_tool.php
new file mode 100755
index 000000000..73d1dbe92
--- /dev/null
+++ b/bin/classifier_tool.php
@@ -0,0 +1,739 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage bin
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+if(php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
+
+/**
+ * Calculate base directory of script
+ * @ignore
+ */
+define("BASE_DIR", substr(
+    dirname(realpath($_SERVER['PHP_SELF'])), 0,
+    -strlen("/bin")));
+
+/** Load in global configuration settings */
+require_once BASE_DIR.'/configs/config.php';
+if(!PROFILE) {
+    echo "Please configure the search engine instance by visiting " .
+        "its web interface on localhost.\n";
+    exit();
+}
+
+/**
+ * NO_CACHE means don't try to use memcache
+ * @ignore
+ */
+define("NO_CACHE", true);
+
+/**
+ * Immediately throw an exception for all notices and warnings, rather than
+ * letting execution continue.
+ * @ignore
+ */
+function handleError($errno, $err_str, $err_file, $err_line)
+{
+    if (error_reporting() == 0) {
+        // Error suppressed by @, so ignore.
+        return;
+    }
+    $msg = "$err_str in $err_file on line $err_line";
+    if ($errno == E_NOTICE || $errno == E_WARNING) {
+        throw new ErrorException($msg, $errno);
+    } else {
+        echo $msg;
+    }
+}
+set_error_handler('handleError');
+
+/** To use and manipulate classifiers */
+require_once BASE_DIR."/lib/classifiers/classifier.php";
+/** To manipulate crawl mixes using the controller's methods */
+require_once BASE_DIR."/controllers/classifier_controller.php";
+
+/*
+ *  We'll set up multi-byte string handling to use UTF-8
+ */
+mb_internal_encoding("UTF-8");
+mb_regex_encoding("UTF-8");
+
+/**
+ * This class is used to automate the building and testing of classifiers,
+ * providing an alternative to the web interface when a labeled training set is
+ * available.
+ *
+ * ClassifierTool takes an activity to perform, the name of a dataset to use,
+ * and a label for the constructed classifier. The activity is the name of one
+ * of the 'run*' functions implemented by this class, without the common 'run'
+ * prefix (e.g., 'TrainAndTest'). The dataset is specified as the common prefix
+ * of two indexes that have the suffixes "Pos" and "Neg", respectively.  So if
+ * the prefix were "DATASET", then this tool would look for the two existing
+ * indexes "DATASET Pos" and "DATASET Neg" from which to draw positive and
+ * negative examples. Each document in these indexes should be a positive or
+ * negative example of the target class, according to whether it's in the "Pos"
+ * or "Neg" index. Finally, the label is just the label to be used for the
+ * constructed classifier.
+ *
+ * Beyond these options (set with the -a, -d, and -l flags), a number of other
+ * options may be set to alter parameters used by an activity or a classifier.
+ * These options are set using the -S, -I, -F, and -B flags, which correspond
+ * to string, integer, float, and boolean parameters respectively. These flags
+ * may be used repeatedly, and each expects an argument of the form NAME=VALUE,
+ * where NAME is the name of a parameter, and VALUE is a value parsed according
+ * to the flag. The NAME should match one of the keys of the options member of
+ * this class, where a period ('.') may be used to specify nesting.  For
+ * example:
+ *
+ *    -I debug=1         # set the debug level to 1
+ *    -B cls.use_nb=1    # tell the classifier to use Naive Bayes
+ *
+ * To build and evaluate a classifier for the label 'spam', trained using the
+ * two indexes "DATASET Neg" and "DATASET Pos", and a maximum of the top 25
+ * most informative features:
+ *
+ * php bin/classifier_tool.php -a TrainAndTest -d 'DATASET' -l 'spam'
+ *    -I cls.chi2.max=25
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ */
+class ClassifierTool
+{
+    /**
+     * Reference to a classifier controller, used to manipulate crawl mixes in
+     * the same way that the controller that handles web requests does.
+     * @var object
+     */
+    var $classifierController;
+
+    /**
+     * Reference to a crawl model object, also used to manipulate crawl mixes.
+     * @var object
+     */
+    var $crawlModel;
+
+    /**
+     * Options to be used by activities and constructed classifiers. These
+     * options can be overridden by supplying an appropriate flag on the
+     * command line, where nesting is denoted by a period (e.g., cls.chi2.max).
+     * The supported options are:
+     *
+     *    debug: An integer, the level of debug statements to print. Larger
+     *        integers specify more detailed debug output; the default value of
+     *        0 indicates no debug output.
+     *
+     *    max_train: An integer, the maximum number of examples to use when
+     *        training a classifier. The default value of NULL indicates that
+     *        all available training examples should be used.
+     *
+     *    test_interval: An integer, the number of new training examples to be
+     *        added before a round of testing on ALL test instances is to be
+     *        executed. With an interval of 5, for example, after adding five
+     *        new training examples, the classifier would be finalized and used
+     *        to classify all test instances. The error is reported for each
+     *        round of testing. The default value of NULL indicates that
+     *        testing should only occur after all training examples have been
+     *        added.
+     *
+     *    split: An integer, the number of examples from the entire set of
+     *        labeled examples to use for training. The remainder are used for
+     *        testing.
+     *
+     *    cls.use_nb: A boolean, whether or not to use the Naive Bayes
+     *        classification algorithm instead of the logistic regression one
+     *        in order to finalize the classifier.  The default value is false,
+     *        indicating that logistic regression should be used.
+     *
+     *    cls.chi2.max: An integer, the maximum number of features to use when
+     *        training the classifier.  The default is a relatively
+     *        conservative 200.
+     *
+     * @var array
+     */
+    var $options = array(
+        'debug' => 0,
+        'max_train' => NULL,
+        'test_interval' => NULL,
+        'split' => 3000,
+        'cls' => array(
+            'use_nb' => false,
+            'chi2' => array(
+                'max' => 200)));
+
+    /**
+     * Initializes the classifier controller and crawl model that will be used
+     * to manage crawl mixes, used for iterating over labeled examples.
+     */
+    function __construct()
+    {
+        $this->classifierController = new ClassifierController();
+        $this->crawlModel = $this->classifierController->crawlModel;
+    }
+
+    /**
+     * Parses the command-line options, returns the required arguments, and
+     * updates the member variable $options with any parameters. If any of the
+     * required arguments (activity, dataset, or label) are missing, then a
+     * message is printed and the program exits. The optional arguments used to
+     * set parameters directly modify the class state through the setOptions
+     * method.
+     *
+     * @return array the parsed activity, dataset, and label
+     */
+    function parseOptions()
+    {
+        $shortopts = 'l:a:d:S:I:F:B:';
+        $options = getopt($shortopts);
+        if (!isset($options['a'])) {
+            echo "missing -a flag to choose activity to run\n";
+            exit(1);
+        }
+        if (!isset($options['l'])) {
+            echo "missing -l flag to set classifier label\n";
+            exit(1);
+        }
+        if (!isset($options['d'])) {
+            echo "missing -d flag to choose dataset to use\n";
+            exit(1);
+        }
+        $activity = $options['a'];
+        $label = Classifier::cleanLabel($options['l']);
+        $dataset_name = $options['d'];
+        unset($options['a'], $options['l'], $options['d']);
+        foreach ($options as $opt_name => $value) {
+            switch ($opt_name) {
+            case 'S':
+                $this->setOptions($value);
+                break;
+            case 'I':
+                $this->setOptions($value, 'intval');
+                break;
+            case 'F':
+                $this->setOptions($value, 'floatval');
+                break;
+            case 'B':
+                $this->setOptions($value, 'boolval');
+                break;
+            default:
+                echo "unsupported option: {$opt_name}\n";
+                break;
+            }
+        }
+        return array($activity, $dataset_name, $label);
+    }
+
+    /**
+     * Parses the options, and if an appropriate activity exists, calls the
+     * activity, passing in the label and dataset to be used; otherwise, prints
+     * an error and exits.
+     */
+    function main()
+    {
+        list($activity, $dataset_name, $label) = $this->parseOptions();
+        $method = "run{$activity}";
+        if (method_exists($this, $method)) {
+            $this->$method($label, $dataset_name);
+        } else {
+            echo "no activity: {$activity}\n";
+            exit(1);
+        }
+    }
+
+    /* ACTIVITIES */
+
+    /**
+     * Trains a classifier on a data set, testing at the specified intervals.
+     * The testing interval is set by the test_interval parameter. Each time
+     * this activity is run a new classifier is created (replacing an old one
+     * with the same label, if necessary), and the classifier remains at the
+     * end.
+     *
+     * @param string $label class label of the new classifier
+     * @param string $dataset_name name of the dataset to train and test on
+     */
+    function runTrainAndTest($label, $dataset_name)
+    {
+        $this->setDefault('max_train', 200);
+        $this->logOptions();
+        $classifier = $this->makeFreshClassifier($label);
+        $data = $this->loadDataset($dataset_name, $label);
+        $classifier->initBuffer($data['train'], 0);
+        $pages = $data['train'];
+        $classifier->prepareToLabel();
+        $end = min($this->options['max_train'], $pages->length);
+        for ($i = 1; $i <= $end; $i++) {
+            $page = $pages->nextPage();
+            $doc_label = $page['TRUE_LABEL'];
+            $key = Classifier::makeKey($page);
+            $classifier->addBufferDoc($page, false);
+            $classifier->labelDocument($key, $doc_label, false);
+            if ($this->isTestPoint($i, $end)) {
+                Classifier::setClassifier($classifier);
+                $this->testClassifier($classifier, $data);
+                /*
+                   Testing the classifier puts it into "classify" mode, which
+                   will use a different set of data from "label" mode, so it's
+                   important to switch back.
+                */
+                $classifier->prepareToLabel();
+            }
+        }
+    }
+
+    /**
+     * Like the TrainAndTest activity, but uses active training in order to
+     * choose the documents to add to the training set. The method simulates
+     * the process that an actual user would go through in order to label
+     * documents for addition to the training set, then tests performance at
+     * the specified intervals.
+     *
+     * @param string $label class label of the new classifier
+     * @param string $dataset_name name of the dataset to train and test on
+     */
+    function runActiveTrainAndTest($label, $dataset_name)
+    {
+        $this->setDefault('max_train', 200);
+        $this->logOptions();
+        $classifier = $this->makeFreshClassifier($label);
+        $data = $this->loadDataset($dataset_name, $label);
+        $pages = $data['train'];
+        $classifier->prepareToLabel();
+        $classifier->initBuffer($pages);
+        $end = min($this->options['max_train'], $pages->length);
+        for ($i = 1; $i <= $end; $i++) {
+            list($new_doc, $disagreement) =
+                $classifier->findNextDocumentToLabel();
+            if ($new_doc) {
+                $key = Classifier::makeKey($new_doc);
+                $doc_label = $new_doc['TRUE_LABEL'];
+                $classifier->labelDocument($key, $doc_label);
+                $classifier->refreshBuffer($pages);
+                $classifier->computeBufferDensities();
+                $classifier->train();
+            }
+            if ($this->isTestPoint($i, $end)) {
+                Classifier::setClassifier($classifier);
+                $this->testClassifier($classifier, $data);
+                $classifier->prepareToLabel();
+            }
+        }
+    }
+
+    /* UTILITY METHODS */
+
+    /**
+     * Creates a new classifier for a label, first deleting any existing
+     * classifier with the same label.
+     *
+     * @param string $label class label of the new classifier
+     * @return object created classifier instance
+     */
+    function makeFreshClassifier($label)
+    {
+        if ($classifier = Classifier::getClassifier($label)) {
+            $this->deleteClassifier($label);
+        }
+        $classifier = new Classifier($label, $this->options['cls']);
+        Classifier::setClassifier($classifier);
+        return $classifier;
+    }
+
+    /**
+     * Deletes an existing classifier, specified by its label.
+     *
+     * @param string $label class label of the existing classifier
+     */
+    function deleteClassifier($label)
+    {
+        Classifier::deleteClassifier($label);
+        $mix_name = Classifier::getCrawlMixName($label);
+        $mix_time = $this->crawlModel->getCrawlMixTimestamp($mix_name);
+        if ($mix_time) {
+            $this->crawlModel->deleteCrawlMixIteratorState($mix_time);
+            $this->crawlModel->deleteCrawlMix($mix_time);
+        }
+    }
+
+    /**
+     * Fetches the summaries for pages in the indices specified by the passed
+     * dataset name. This method looks for existing indexes with names matching
+     * the dataset name prefix, and with suffix either "pos" or "neg" (ignoring
+     * case). The pages in these indexes are shuffled into one large array, and
+     * augmented with a TRUE_LABEL field that records which set they came from
+     * originally. The shuffled array is then split according to the `split'
+     * option, and all pages up to (but not including) the split index are used
+     * for the training set; the remaining pages are used for the test set.
+     *
+     * @param string $dataset_name prefix of index names to draw examples from
+     * @param string $class_label class label of the classifier the examples
+     *  will be used to train (used to name the crawl mix that iterates over
+     *  each index)
+     * @return array training and test datasets in an associative array with
+     *  keys `train' and `test', where each dataset is wrapped up in a
+     *  PageIterator that implements the CrawlMixIterator interface.
+     */
+    function loadDataset($dataset_name, $class_label)
+    {
+        $crawls = $this->crawlModel->getCrawlList(false, true, NULL);
+        $dataset_name = preg_quote($dataset_name, '/');
+        $re = '/^RECRAWL::'.$dataset_name.' (pos|neg)$/i';
+        $pages = array();
+        foreach ($crawls as $crawl) {
+            if (!preg_match($re, $crawl['DESCRIPTION'], $groups)) {
+                continue;
+            }
+            $label = strtolower($groups[1]);
+            $doc_label = $label == 'pos' ? 1 : -1;
+            $mix_iterator =
+                $this->classifierController->buildClassifierCrawlMix(
+                    $class_label, $crawl['CRAWL_TIME']);
+            while (!$mix_iterator->end_of_iterator) {
+                $new_pages = $mix_iterator->nextPages(5000);
+                /*
+                   This field can be added to the results from a crawl mix
+                   iterator, but we don't care about it, so we just discard it.
+                */
+                if (isset($new_pages['NO_PROCESS'])) {
+                    unset($new_pages['NO_PROCESS']);
+                }
+                foreach ($new_pages as $page) {
+                    $page['TRUE_LABEL'] = $doc_label;
+                    $pages[] = $page;
+                }
+            }
+        }
+        shuffle($pages);
+        if (count($pages) < $this->options['split']) {
+            echo "split is larger than dataset\n";
+            exit(1);
+        }
+        $data = array();
+        $data['train'] = new PageIterator(
+            array_slice($pages, 0, $this->options['split']));
+        $data['test'] = new PageIterator(
+            array_slice($pages, $this->options['split']));
+        return $data;
+    }
+
+    /**
+     * Determines whether to run a classification test after a certain number
+     * of documents have been added to the training set. Whether or not to test
+     * is determined by the `test_interval' option, which may be either NULL,
+     * an integer, or a string. In the first case, testing only occurs after
+     * all training examples have been added; in the second case, testing
+     * occurs each time an additional constant number of training examples have
+     * been added; and in the final case, testing occurs on a fixed schedule of
+     * comma-separated offsets, such as "10,25,50,100".
+     *
+     * @param int $i the size of the current training set
+     * @param int $total the total number of documents available to be added to
+     *  the training set
+     * @return bool true if the `test_interval' option specifies that a round
+     *  of testing should occur for the current training offset, and false
+     *  otherwise
+     */
+    function isTestPoint($i, $total)
+    {
+        if (is_null($this->options['test_interval'])) {
+            return $i == $total;
+        } else if (is_int($this->options['test_interval'])) {
+            return $i % $this->options['test_interval'] == 0;
+        } else {
+            $re = '/(^|,)'.$i.'(,|$)/';
+            return preg_match($re, $this->options['test_interval']);
+        }
+    }
+
+    /**
+     * Finalizes the current classifier, uses it to classify all test
+     * documents, and logs the classification error.  The current classifier is
+     * saved to disk after finalizing (though not before), and left in
+     * `classify' mode. The iterator over the test dataset is reset for the
+     * next round of testing (if any).
+     *
+     * @param object $classifier classifier instance to test
+     * @param array $data the array of training and test datasets, constructed
+     *  by loadDataset, of which only the `test' dataset is used.
+     */
+    function testClassifier($classifier, $data)
+    {
+        $classifier->prepareToFinalize();
+        $classifier->finalize();
+        Classifier::setClassifier($classifier);
+        $classifier->prepareToClassify();
+        $wrong = 0;
+        $total = 0;
+        $pages = $data['test'];
+        while (!$pages->end_of_iterator) {
+            $page = $pages->nextPage();
+            $score = $classifier->classify($page);
+            $page_label = $score >= 0.5 ? 1 : -1;
+            if ($page_label != $page['TRUE_LABEL']) {
+                $wrong++;
+            }
+            $total++;
+        }
+        $error = (float)$wrong / $total;
+        $this->log(0, 'error = %.4f', $error);
+        $pages->reset();
+    }
+
+    /**
+     * Writes out logging information according to a detail level. The first
+     * argument is an integer (potentially negative) indicating the level of
+     * detail for the log message, where larger numbers indicate greater
+     * detail. Each message is prefixed with a character according to its level
+     * of detail, but if the detail level is greater than the level specified
+     * by the `debug' option then nothing is printed. The treatment for the
+     * available detail levels are as follows:
+     *
+     *     -2: Used for errors; always printed; prefix '! '
+     *     -1: Used for log of set options; always printed; prefix '# '
+     *     0+: Used for normal messages; prefix '> '
+     *
+     * The second argument is a printf-style string template specifying the
+     * message, and each following (optional) argument is used by the template.
+     * A newline is added automatically to each message.
+     *
+     * @param int $level level of detail for the message
+     * @param string $message printf-style template for the message
+     * @param string $args,... optional arguments to be used for the message
+     *  template
+     */
+    function log(/* varargs */)
+    {
+        $args = func_get_args();
+        $level = array_shift($args);
+        if ($level > $this->options['debug']) {
+            return;
+        }
+        if ($level == -2) {
+            echo '! ';
+        } else if ($level == -1) {
+            echo '# ';
+        } else {
+            echo '> ';
+        }
+        call_user_func_array('printf', $args);
+        echo "\n";
+    }
+
+    /**
+     * Logs the current options using the log method of this class. This method
+     * is used to explicitly state which settings were used for a given run of
+     * an activity. The detail level passed to the log method is -1.
+     */
+    function logOptions($root = NULL, $prefix = '')
+    {
+        if (is_null($root)) {
+            $root = $this->options;
+        }
+        foreach ($root as $key => $value) {
+            if (is_array($value)) {
+                $this->logOptions($value, $prefix.$key.'.');
+            } else if (!is_null($value)) {
+                if ($value === false) $value = 'false';
+                else if ($value === true) $value = 'true';
+                $this->log(-1, '%s%s = %s', $prefix, $key, strval($value));
+            }
+        }
+    }
+
+    /**
+     * Sets one or more options of the form NAME=VALUE according to a converter
+     * such as intval or floatval. Options may be passed as a single string or
+     * as an array of strings that all share one target type (e.g., int). An
+     * unknown NAME is logged as an error and skipped; no option is changed.
+     *
+     * @param string|array $opts single option in the format NAME=VALUE, or
+     *  array of options, each for the same target type (e.g., int)
+     * @param string $converter the name of a function that takes a string and
+     *  casts it to a particular type (e.g., intval, floatval)
+     */
+    function setOptions($opts, $converter = NULL)
+    {
+        if (!is_array($opts)) {
+            $opts = array($opts);
+        }
+        foreach ($opts as $opt) {
+            $split = strpos($opt, '=');
+            $name = substr($opt, 0, $split);
+            $value = substr($opt, $split + 1);
+            if ($converter) {
+                if ($converter == 'boolval' && !function_exists('boolval')) {
+                    $value = (bool)$value;
+                } else {
+                    $value = call_user_func($converter, $value);
+                }
+            }
+            $fields = explode('.', $name);
+            $field =& $this->options;
+            while (!empty($fields)) {
+                $top = array_shift($fields);
+                if (is_array($field) && array_key_exists($top, $field)) {
+                    $field =& $field[$top];
+                } else {
+                    $this->log(-2, 'unknown option: "%s"', $name);
+                    continue 2; // unknown name: never reach the assignment
+                }
+            }
+            // Every component of NAME resolved, so $field now aliases the
+            // targeted option slot and can safely receive the new value.
+            $field = $value;
+        }
+    }
+
+    /**
+     * Sets a default value for a runtime parameter. This method is used by
+     * activities to specify default values that may be overridden by passing
+     * the appropriate command-line flag.
+     */
+    function setDefault($name, $value)
+    {
+        $fields = explode('.', $name);
+        $field =& $this->options;
+        while (count($fields) > 1) {
+            $top = array_shift($fields);
+            $field =& $field[$top];
+        }
+        $last = array_shift($fields);
+        if (!isset($field[$last])) {
+            $field[$last] = $value;
+        }
+    }
+}
+
+
+/**
+ * This class provides the same interface as an iterator over crawl mixes, but
+ * simply iterates over an array.
+ *
+ * This is used to gather all of the pages for a training set in one go (using
+ * a crawl mix iterator), then repeatedly iterate over them in memory, as
+ * though they were coming from the original crawl mix iterator.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ */
+class PageIterator
+{
+    /**
+     * The array of pages to repeatedly iterate over.
+     * @var array
+     */
+    var $pages;
+
+    /**
+     * The total number of pages.
+     * @var int
+     */
+    var $length;
+
+    /**
+     * The current offset into the wrapped array.
+     * @var int
+     */
+    var $pos;
+
+    /**
+     * Whether or not the last page has been reached.
+     * @var bool
+     */
+    var $end_of_iterator;
+
+    /**
+     * Establishes a new iterator over a (potentially empty) array of pages.
+     *
+     * @param array $pages standard array of pages to iterate over
+     */
+    function __construct($pages)
+    {
+        $this->pages = $pages;
+        $this->length = count($pages);
+        $this->reset();
+    }
+
+    /**
+     * Resets the iterator so that the next page will be the first.
+     */
+    function reset()
+    {
+        $this->pos = 0;
+        $this->end_of_iterator = $this->length == 0;
+    }
+
+    /**
+     * Returns up to the requested number of next pages, potentially an empty
+     * array if there are no pages left. This method updates the
+     * `end_of_iterator' flag according to whether the last page has been
+     * returned.
+     *
+     * @param int $n maximum number of pages to return, or -1 to return all
+     *  remaining pages
+     * @return array next $n pages, or less if there are fewer than $n
+     *  pages remaining
+     */
+    function nextPages($n = -1)
+    {
+        if ($n == -1) {
+            $n = $this->length - $this->pos;
+        } else {
+            $n = min($this->length - $this->pos, $n);
+        }
+        $start = $this->pos;
+        $this->pos += $n;
+        if ($this->pos == $this->length) {
+            $this->end_of_iterator = true;
+        }
+        return array_slice($this->pages, $start, $n);
+    }
+
+    /**
+     * Behaves like nextPages, but returns just the next page (not wrapped in
+     * an array) if there is one, and NULL otherwise.
+     *
+     * @return array next page if available, and NULL otherwise
+     */
+    function nextPage()
+    {
+        $next = $this->nextPages(1);
+        return !empty($next) ? $next[0] : NULL;
+    }
+}
+
+try {
+    $classifier_tool = new ClassifierTool();
+    $classifier_tool->main();
+} catch (ErrorException $e) {
+    echo $e . "\n";
+}
+?>
\ No newline at end of file
diff --git a/bin/classifier_trainer.php b/bin/classifier_trainer.php
new file mode 100755
index 000000000..4e5b170b4
--- /dev/null
+++ b/bin/classifier_trainer.php
@@ -0,0 +1,125 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage bin
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+
+if(php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
+
+/*
+   Calculate base directory of script (this file's directory minus "/bin")
+ */
+define("BASE_DIR", substr(
+    dirname(realpath($_SERVER['PHP_SELF'])), 0,
+    -strlen("/bin")));
+
+/*
+   We must specify that we want logging enabled
+ */
+define("NO_LOGGING", false);
+
+/*
+   Load in global configuration settings
+ */
+require_once BASE_DIR.'/configs/config.php';
+if(!PROFILE) {
+    echo "Please configure the search engine instance by visiting " .
+        "its web interface on localhost.\n";
+    exit();
+}
+
+/** Used to initialize and terminate the daemon */
+require_once BASE_DIR."/lib/crawl_daemon.php";
+/** Used to create, update, and delete user-trained classifiers. */
+require_once BASE_DIR."/lib/classifiers/classifier.php";
+
+/*
+    We'll set up multi-byte string handling to use UTF-8
+ */
+mb_internal_encoding("UTF-8");
+mb_regex_encoding("UTF-8");
+
+
+/*
+   If possible, set the memory limit high enough to fit all of the features and
+   training documents into memory.
+ */
+ini_set("memory_limit", "500M");
+
+
+/**
+ * This class is used to finalize a classifier via the web interface.
+ *
+ * Because finalizing involves training a logistic regression classifier on a
+ * potentially-large set of training examples, it can take much longer than
+ * would be allowed by the normal web execution time limit. So instead of
+ * trying to finalize a classifier directly in the controller that handles the
+ * web request, the controller kicks off a daemon that simply loads the
+ * classifier, finalizes it, and saves it back to disk.
+ *
+ * The classifier to finalize is specified by its class label, passed as the
+ * second command-line argument. The following command would be used to run
+ * this script directly from the command-line:
+ *
+ *     $ php bin/classifier_trainer.php terminal LABEL
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ */
+class ClassifierTrainer
+{
+    /**
+     *  This is the function that should be called to get the
+     *  classifier_trainer to start training a logistic regression instance for
+     *  a particular classifier. The class label corresponding to the
+     *  classifier to be finalized should be passed as the second command-line
+     *  argument.
+     *
+     *  If no class label was supplied, a usage message is printed and nothing
+     *  is trained.
+     */
+    function start()
+    {
+        global $argv;
+        CrawlDaemon::init($argv, "classifier_trainer");
+        /*
+           Without a label there is no classifier to load; bail out with a
+           readable message instead of trying to load a classifier for an
+           undefined label.
+         */
+        if(!isset($argv[2])) {
+            echo "classifier_trainer needs the class label of the ".
+                "classifier to finalize as its second argument.\n";
+            return;
+        }
+        $label = $argv[2];
+        // The second argument names the log file used for this label's run.
+        crawlLog("Initializing classifier trainer log..",
+            $label.'-classifier_trainer');
+        // Load, train, and write the classifier back to disk.
+        $classifier = Classifier::getClassifier($label);
+        $classifier->prepareToFinalize();
+        $classifier->finalize();
+        Classifier::setClassifier($classifier);
+        crawlLog("Training complete.\n");
+        CrawlDaemon::stop('classifier_trainer', $label);
+    }
+}
+
+// Script entry point: build the trainer and run it; start() reads $argv.
+$classifier_trainer = new ClassifierTrainer();
+$classifier_trainer->start();
+
+?>
\ No newline at end of file
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 0dcfddc8d..a616603db 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -90,6 +90,8 @@ require_once BASE_DIR."/lib/url_parser.php";
 require_once BASE_DIR."/lib/phrase_parser.php";
 /** For user-defined processing on page summaries*/
 require_once BASE_DIR."/lib/page_rule_parser.php";
+/** For user-trained classification of page summaries*/
+require_once BASE_DIR."/lib/classifiers/classifier.php";
 /** for crawlHash and crawlLog */
 require_once BASE_DIR."/lib/utility.php";
 /** for crawlDaemon function */
@@ -1109,6 +1111,8 @@ class Fetcher implements CrawlConstants
             } else {
                 $info[self::ARC_DATA] = $pages;
             }
+        } else if(isset($info['ARCHIVE_BUNDLE_ERROR'])) {
+            crawlLog("  ".$info['ARCHIVE_BUNDLE_ERROR']);
         }

         crawlLog("Time to fetch archive data from name server ".
@@ -1193,7 +1197,26 @@ class Fetcher implements CrawlConstants
                 $this->$field = $info[$info_field];
             }
         }
-
+        if(!empty($info[self::ACTIVE_CLASSIFIERS_DATA])){
+            /*
+               The classifier data is set by the fetch controller for each
+               active classifier, and is a compressed, serialized structure
+               containing all of the objects needed for classification.
+             */
+            $classifiers_data = $info[self::ACTIVE_CLASSIFIERS_DATA];
+            $this->classifiers = array();
+            foreach ($classifiers_data as $label => $classifier_data) {
+                if ($classifier_data) {
+                    $classifier = Classifier::newClassifierFromData(
+                        $classifier_data);
+                    $this->classifiers[] = $classifier;
+                    crawlLog("Classifying with '{$label}' classifier.");
+                } else {
+                    crawlLog("Skipping classifier '{$label}'; missing ".
+                        "finalized data.");
+                }
+            }
+        }
         if(isset($info[self::PAGE_RULES]) ){
             $rule_string = implode("\n", $info[self::PAGE_RULES]);
             $rule_string = html_entity_decode($rule_string, ENT_QUOTES);
@@ -1547,6 +1570,17 @@ class Fetcher implements CrawlConstants
                     isset($site[self::ROBOT_PATHS])) {
                     $summarized_site_pages[$i][self::JUST_METAS] = true;
                 }
+                if(isset($site[self::DOC_INFO][self::META_WORDS])) {
+                    if (!isset($summarized_site_pages[$i][self::META_WORDS])) {
+                        $summarized_site_pages[$i][self::META_WORDS] =
+                            $site[self::DOC_INFO][self::META_WORDS];
+                    } else {
+                        $summarized_site_pages[$i][self::META_WORDS] =
+                            array_merge(
+                                $summarized_site_pages[$i][self::META_WORDS],
+                                $site[self::DOC_INFO][self::META_WORDS]);
+                    }
+                }
                 if(isset($site[self::DOC_INFO][self::LANG])) {
                     if($site[self::DOC_INFO][self::LANG] == 'en' &&
                         $site[self::ENCODING] != "UTF-8") {
@@ -1580,6 +1614,10 @@ class Fetcher implements CrawlConstants
                     $this->page_rule_parser->executeRuleTrees(
                         $summarized_site_pages[$i]);
                 }
+                if(!empty($this->classifiers)) {
+                    Classifier::labelPage($summarized_site_pages[$i],
+                        $this->classifiers);
+                }
                 $i++;
             }
         } // end for
diff --git a/configs/createdb.php b/configs/createdb.php
index bff6681bd..3721eb3e1 100755
--- a/configs/createdb.php
+++ b/configs/createdb.php
@@ -80,7 +80,7 @@ if(!$profile_model->createDatabaseTables($db, $dbinfo)) {
     exit();
 }

-$db->execute("INSERT INTO VERSION VALUES (15)");
+$db->execute("INSERT INTO VERSION VALUES (16)");

 //default account is root without a password
 $sql ="INSERT INTO USER VALUES (1, 'root', '".crawlCrypt('')."' ) ";
@@ -112,7 +112,8 @@ $db->execute("INSERT INTO LOCALE VALUES (
     18, 'hi', 'हिन्दी', 'lr-tb')");
 $db->execute("INSERT INTO LOCALE VALUES (19, 'tr', 'Türkçe', 'lr-tb')");
 $db->execute("INSERT INTO LOCALE VALUES (20, 'fa', 'فارسی', 'rl-tb')");
-$db->execute("INSERT INTO LOCALE VALUES (21, 'te', 'తెలుగు', 'lr-tb')");
+$db->execute("INSERT INTO LOCALE VALUES (21, 'te',
+    'తెలుగు', 'lr-tb')");

 $sql ="INSERT INTO ROLE VALUES (1, 'Admin' ) ";
 $db->execute($sql);
@@ -128,31 +129,36 @@ $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, 8)");
 $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, 9)");
 $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, 10)");
 $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, 11)");
+$db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, 12)");

 $db->execute("INSERT INTO ACTIVITY VALUES (1, 1, 'manageAccount')");
 $db->execute("INSERT INTO ACTIVITY VALUES (2, 2, 'manageUsers')");
 $db->execute("INSERT INTO ACTIVITY VALUES (3, 3, 'manageRoles')");
 $db->execute("INSERT INTO ACTIVITY VALUES (4, 4, 'manageCrawls')");
 $db->execute("INSERT INTO ACTIVITY VALUES (5, 5, 'mixCrawls')");
-$db->execute("INSERT INTO ACTIVITY VALUES (6, 6, 'pageOptions')");
-$db->execute("INSERT INTO ACTIVITY VALUES (7, 7, 'resultsEditor')");
-$db->execute("INSERT INTO ACTIVITY VALUES (8, 8, 'searchSources')");
-$db->execute("INSERT INTO ACTIVITY VALUES (9, 9, 'manageMachines')");
-$db->execute("INSERT INTO ACTIVITY VALUES (10, 10, 'manageLocales')");
-$db->execute("INSERT INTO ACTIVITY VALUES (11, 11, 'configure')");
+$db->execute("INSERT INTO ACTIVITY VALUES (6, 6, 'manageClassifiers')");
+$db->execute("INSERT INTO ACTIVITY VALUES (7, 7, 'pageOptions')");
+$db->execute("INSERT INTO ACTIVITY VALUES (8, 8, 'resultsEditor')");
+$db->execute("INSERT INTO ACTIVITY VALUES (9, 9, 'searchSources')");
+$db->execute("INSERT INTO ACTIVITY VALUES (10, 10, 'manageMachines')");
+$db->execute("INSERT INTO ACTIVITY VALUES (11, 11, 'manageLocales')");
+$db->execute("INSERT INTO ACTIVITY VALUES (12, 12, 'configure')");

 $db->execute("INSERT INTO TRANSLATION VALUES (1,'db_activity_manage_account')");
 $db->execute("INSERT INTO TRANSLATION VALUES (2, 'db_activity_manage_users')");
 $db->execute("INSERT INTO TRANSLATION VALUES (3, 'db_activity_manage_roles')");
 $db->execute("INSERT INTO TRANSLATION VALUES (4, 'db_activity_manage_crawl')");
 $db->execute("INSERT INTO TRANSLATION VALUES (5, 'db_activity_mix_crawls')");
-$db->execute("INSERT INTO TRANSLATION VALUES (6, 'db_activity_file_options')");
-$db->execute("INSERT INTO TRANSLATION VALUES (7,'db_activity_results_editor')");
-$db->execute("INSERT INTO TRANSLATION VALUES(8,'db_activity_search_services')");
-$db->execute("INSERT INTO TRANSLATION VALUES(9,'db_activity_manage_machines')");
-$db->execute("INSERT INTO TRANSLATION VALUES (10,
+$db->execute("INSERT INTO TRANSLATION VALUES (6,
+    'db_activity_manage_classifiers')");
+$db->execute("INSERT INTO TRANSLATION VALUES (7, 'db_activity_file_options')");
+$db->execute("INSERT INTO TRANSLATION VALUES (8,'db_activity_results_editor')");
+$db->execute("INSERT INTO TRANSLATION VALUES(9,'db_activity_search_services')");
+$db->execute("INSERT INTO TRANSLATION VALUES(10,
+    'db_activity_manage_machines')");
+$db->execute("INSERT INTO TRANSLATION VALUES (11,
     'db_activity_manage_locales')");
-$db->execute("INSERT INTO TRANSLATION VALUES (11, 'db_activity_configure')");
+$db->execute("INSERT INTO TRANSLATION VALUES (12, 'db_activity_configure')");


 $db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (1, 1, 'Manage Account' )");
@@ -160,12 +166,14 @@ $db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (2, 1, 'Manage Users')");
 $db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (3, 1, 'Manage Roles')");
 $db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (4, 1, 'Manage Crawls')");
 $db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (5, 1, 'Mix Crawls')");
-$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (6, 1, 'Page Options')");
-$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (7, 1, 'Results Editor')");
-$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (8, 1, 'Search Sources')");
-$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (9, 1, 'Manage Machines')");
-$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (10, 1, 'Manage Locales')");
-$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (11, 1, 'Configure')");
+$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (6, 1, 'Classifiers')");
+$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (7, 1, 'Page Options')");
+$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (8, 1, 'Results Editor')");
+$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (9, 1, 'Search Sources')");
+$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (10,
+    1, 'Manage Machines')");
+$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (11, 1, 'Manage Locales')");
+$db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (12, 1, 'Configure')");

 $db->execute("INSERT INTO TRANSLATION_LOCALE VALUES (1, 5,
     'Modifier votre compte' )");
diff --git a/controllers/admin_controller.php b/controllers/admin_controller.php
index 5296110cd..9dcf19733 100755
--- a/controllers/admin_controller.php
+++ b/controllers/admin_controller.php
@@ -41,6 +41,8 @@ require_once BASE_DIR."/lib/crawl_constants.php";
 require_once BASE_DIR."/lib/url_parser.php";
 /** Used in rule parser test in page options */
 require_once BASE_DIR."/lib/page_rule_parser.php";
+/** Used to create, update, and delete user-trained classifiers. */
+require_once BASE_DIR."/lib/classifiers/classifier.php";
 /** Loads crawl_daemon to manage news_updater */
 require_once BASE_DIR."/lib/crawl_daemon.php";
 /** get processors for different file types */
@@ -79,9 +81,9 @@ class AdminController extends Controller implements CrawlConstants
      * @var array
      */
     var $activities = array("signin", "manageAccount", "manageUsers",
-        "manageRoles", "manageCrawls", "pageOptions", "resultsEditor",
-        "manageMachines", "manageLocales", "crawlStatus", "mixCrawls",
-        "machineStatus", "searchSources", "configure");
+        "manageRoles", "manageCrawls", "pageOptions", "manageClassifiers",
+        "resultsEditor", "manageMachines", "manageLocales", "crawlStatus",
+        "mixCrawls", "machineStatus", "searchSources", "configure");
     /**
      * An array of activities which are periodically updated within other
      * activities that they live. For example, within manage crawl,
@@ -930,18 +932,22 @@ class AdminController extends Controller implements CrawlConstants
         $crawl_params[self::DISALLOWED_SITES] =
             isset($seed_info['disallowed_sites']['url']) ?
             $seed_info['disallowed_sites']['url'] : array();
-        $crawl_params[self::PAGE_RULES] =
-            isset($seed_info['page_rules']['rule']) ?
-            $seed_info['page_rules']['rule'] : array();
-
-        if(isset($seed_info['indexing_plugins']['plugins'])) {
-            $crawl_params[self::INDEXING_PLUGINS] =
-                $seed_info['indexing_plugins']['plugins'];
-        }
         if(isset($seed_info['indexed_file_types']['extensions'])) {
             $crawl_params[self::INDEXED_FILE_TYPES] =
                 $seed_info['indexed_file_types']['extensions'];
         }
+        if(isset($seed_info['active_classifiers']['label'])) {
+            // Note that 'label' is actually an array of active class labels.
+            $crawl_params[self::ACTIVE_CLASSIFIERS] =
+                $seed_info['active_classifiers']['label'];
+        }
+        if(isset($seed_info['indexing_plugins']['plugins'])) {
+            $crawl_params[self::INDEXING_PLUGINS] =
+                $seed_info['indexing_plugins']['plugins'];
+        }
+        $crawl_params[self::PAGE_RULES] =
+            isset($seed_info['page_rules']['rule']) ?
+            $seed_info['page_rules']['rule'] : array();
     }

     /**
@@ -1025,7 +1031,7 @@ class AdminController extends Controller implements CrawlConstants
             $seed_info = $this->crawlModel->getSeedInfo();
         }
         $page_options_properties = array('indexed_file_types',
-            'page_rules', 'indexing_plugins');
+            'active_classifiers', 'page_rules', 'indexing_plugins');
         //these properties should be changed under page_options not here
         foreach($page_options_properties as $property) {
             if(isset($seed_current[$property])) {
@@ -1437,9 +1443,6 @@ class AdminController extends Controller implements CrawlConstants
      *
      * This activity allows a user to specify the page range size to be
      * be used during a crawl as well as which file types can be downloaded
-     *
-     * @return array $data info about the groups and their contents for a
-     *      particular crawl mix
      */
     function pageOptions()
     {
@@ -1621,6 +1624,30 @@ class AdminController extends Controller implements CrawlConstants
         }
         $seed_info["indexed_file_types"]["extensions"] = $filetypes;

+        $data['CLASSIFIERS'] = array();
+        $active_classifiers = array();
+        foreach (Classifier::getClassifierList() as $classifier) {
+            $label = $classifier->class_label;
+            $ison = false;
+            if (isset($_REQUEST['classifier']) && !$loaded) {
+                if (isset($_REQUEST['classifier'][$label])) {
+                    $ison = true;
+                }
+            } else if (isset($seed_info['active_classifiers']['label'])) {
+                if (in_array($label,
+                    $seed_info['active_classifiers']['label'])) {
+                    $ison = true;
+                }
+            }
+            if ($ison) {
+                $data['CLASSIFIERS'][$label] = 'checked="checked"';
+                $active_classifiers[] = $label;
+            } else {
+                $data['CLASSIFIERS'][$label] = '';
+            }
+        }
+        $seed_info['active_classifiers']['label'] = $active_classifiers;
+
         if(isset($seed_info['page_rules']['rule'])) {
             $data['page_rules'] = $this->convertArrayLines(
                 $seed_info['page_rules']['rule']);
@@ -1773,6 +1800,207 @@ class AdminController extends Controller implements CrawlConstants
         return $data;
     }

+    /**
+     * Handles admin requests for creating, editing, and deleting classifiers.
+     *
+     * This activity implements the logic for the page that lists existing
+     * classifiers, including the actions that can be performed on them.
+     */
+    function manageClassifiers()
+    {
+        $possible_arguments = array('createclassifier', 'editclassifier',
+            'finalizeclassifier', 'deleteclassifier');
+
+        $data = array();
+        $data['ELEMENT'] = 'manageclassifiersElement';
+        $data['SCRIPT'] = '';
+
+        /*
+           When there are no queue servers, or the only one is this
+           machine, NULL is passed along instead of a list of urls.
+           NOTE(review): presumably NULL makes the crawl model look only at
+           local crawls -- confirm against CrawlModel::getCrawlList.
+         */
+        $machine_urls = $this->machineModel->getQueueServerUrls();
+        $num_machines = count($machine_urls);
+        if ($num_machines < 1 || ($num_machines == 1 &&
+            UrlParser::isLocalhostUrl($machine_urls[0]))) {
+            $machine_urls = NULL;
+        }
+
+        $data['leftorright'] =
+            (getLocaleDirection() == 'ltr') ? 'right': 'left';
+
+        $classifiers = Classifier::getClassifierList();
+
+        if (isset($_REQUEST['arg']) &&
+            in_array($_REQUEST['arg'], $possible_arguments, true)) {
+            // A missing class_label is treated as the empty label.
+            $label = isset($_REQUEST['class_label']) ?
+                $this->clean($_REQUEST['class_label'], 'string') : '';
+            $label = Classifier::cleanLabel($label);
+            switch ($_REQUEST['arg'])
+            {
+                case 'createclassifier':
+                    if (!isset($classifiers[$label])) {
+                        $classifier = new Classifier($label);
+                        Classifier::setClassifier($classifier);
+                        $classifiers[$label] = $classifier;
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_new_classifier').'</h1>\');';
+                    } else {
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_classifier_exists').
+                            '</h1>\');';
+                    }
+                    break;
+
+                case 'editclassifier':
+                    if (isset($classifiers[$label])) {
+                        $data['class_label'] = $label;
+                        $this->editClassifier($data, $classifiers,
+                            $machine_urls);
+                    } else {
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_no_classifier').
+                            '</h1>\');';
+                    }
+                    break;
+
+                case 'finalizeclassifier':
+                    /*
+                       Finalizing is too expensive to be done directly in the
+                       controller that responds to the web request. Instead, a
+                       daemon is launched to finalize the classifier
+                       asynchronously and save it back to disk when it's done.
+                       In the meantime, a flag is set to indicate the current
+                       finalizing state.
+
+                       The daemon is only launched for a classifier that
+                       actually exists; an unknown label gets the same
+                       "no classifier" message as the edit and delete actions.
+                     */
+                    if (isset($classifiers[$label])) {
+                        CrawlDaemon::start("classifier_trainer", $label,
+                            '', -1);
+                        $classifier = $classifiers[$label];
+                        $classifier->finalized = Classifier::FINALIZING;
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_finalizing_classifier').
+                            '</h1>\');';
+                    } else {
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_no_classifier').
+                            '</h1>\');';
+                    }
+                    break;
+
+                case 'deleteclassifier':
+                    /*
+                       In addition to deleting the classifier, we also want to
+                       delete the associated crawl mix (if one exists) used to
+                       iterate over existing indexes in search of new training
+                       examples.
+                     */
+                    if (isset($classifiers[$label])) {
+                        unset($classifiers[$label]);
+                        Classifier::deleteClassifier($label);
+                        $mix_name = Classifier::getCrawlMixName($label);
+                        $mix_time = $this->crawlModel->getCrawlMixTimestamp(
+                            $mix_name);
+                        if ($mix_time) {
+                            $this->crawlModel->deleteCrawlMixIteratorState(
+                                $mix_time);
+                            $this->crawlModel->deleteCrawlMix($mix_time);
+                        }
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_classifier_deleted').
+                            '</h1>\');';
+                    } else {
+                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                            tl('admin_controller_no_classifier').
+                            '</h1>\');';
+                    }
+                    break;
+            }
+        }
+
+        $data['classifiers'] = $classifiers;
+        return $data;
+    }
+
+    /**
+     * Handles the particulars of editing a classifier, which includes changing
+     * its label and adding training examples.
+     *
+     * This activity directly handles changing the class label, but not adding
+     * training examples. The latter activity is done interactively without
+     * reloading the page via XmlHttpRequests, coordinated by the classifier
+     * controller dedicated to that task.
+     *
+     * @param array $data data to be passed on to the view
+     * @param array $classifiers map from class labels to their associated
+     *     classifiers
+     * @param array $machine_urls string urls of machines managed by this
+     *     Yioop name server
+     */
+    function editClassifier(&$data, $classifiers, $machine_urls)
+    {
+        $data['ELEMENT'] = 'editclassifierElement';
+        $data['INCLUDE_SCRIPTS'] = array('classifiers');
+
+        // We want recrawls, but not archive crawls.
+        $crawls = $this->crawlModel->getCrawlList(false, true, $machine_urls);
+        $data['CRAWLS'] = $crawls;
+
+        // The caller has already checked that this label exists.
+        $classifier = $classifiers[$data['class_label']];
+
+        if (isset($_REQUEST['update']) && $_REQUEST['update'] == 'update') {
+            if (isset($_REQUEST['rename_label'])) {
+                $new_label = $this->clean($_REQUEST['rename_label'], 'string');
+                $new_label = preg_replace('/[^a-zA-Z0-9_]/', '', $new_label);
+                if (!isset($classifiers[$new_label])) {
+                    /*
+                       Classifiers are objects whose label lives in the
+                       class_label property, so it must be read and written
+                       with property syntax rather than array syntax. The
+                       classifier is saved under its new label before the
+                       copy stored under the old label is removed.
+                     */
+                    $old_label = $classifier->class_label;
+                    $classifier->class_label = $new_label;
+                    Classifier::setClassifier($classifier);
+                    Classifier::deleteClassifier($old_label);
+                    $data['class_label'] = $new_label;
+                } else {
+                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\">".
+                        tl('admin_controller_classifier_exists').
+                        '</h1>\');';
+                }
+            }
+        }
+
+        $data['classifier'] = $classifier;
+
+        // Translations for the classification javascript.
+        $data['SCRIPT'] .= "window.tl = {".
+            'editclassifier_load_failed:"'.
+                tl('editclassifier_load_failed').'",'.
+            'editclassifier_loading:"'.
+                tl('editclassifier_loading').'",'.
+            'editclassifier_added_examples:"'.
+                tl('editclassifier_added_examples').'",'.
+            'editclassifier_label_update_failed:"'.
+                tl('editclassifier_label_update_failed').'",'.
+            'editclassifier_updating:"'.
+                tl('editclassifier_updating').'",'.
+            'editclassifier_acc_update_failed:"'.
+                tl('editclassifier_acc_update_failed').'",'.
+            'editclassifier_na:"'.
+                tl('editclassifier_na').'",'.
+            'editclassifier_no_docs:"'.
+                tl('editclassifier_no_docs').'",'.
+            'editclassifier_num_docs:"'.
+                tl('editclassifier_num_docs').'",'.
+            'editclassifier_in_class:"'.
+                tl('editclassifier_in_class').'",'.
+            'editclassifier_not_in_class:"'.
+                tl('editclassifier_not_in_class').'",'.
+            'editclassifier_skip:"'.
+                tl('editclassifier_skip').'",'.
+            'editclassifier_prediction:"'.
+                tl('editclassifier_prediction').'",'.
+            'editclassifier_scores:"'.
+                tl('editclassifier_scores').'"'.
+            '};';
+
+        /*
+           We pass along authentication information to the client, so that it
+           can authenticate any XmlHttpRequests that it makes in order to label
+           documents.
+         */
+        $time = strval(time());
+        $session = md5($time.AUTH_KEY);
+        $data['SCRIPT'] .=
+            "Classifier.initialize(".
+                "'{$data['class_label']}',".
+                "'{$session}',".
+                "'{$time}');";
+    }
+
     /**
      * Handles admin request related to the search filter activity
      *
diff --git a/controllers/classifier_controller.php b/controllers/classifier_controller.php
new file mode 100644
index 000000000..626e5a79a
--- /dev/null
+++ b/controllers/classifier_controller.php
@@ -0,0 +1,351 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage controller
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/** Load base controller class if needed */
+require_once BASE_DIR."/controllers/controller.php";
+/** Loads common constants for web crawling */
+require_once BASE_DIR."/lib/crawl_constants.php";
+/** Loads url_parser to clean resource name */
+require_once BASE_DIR."/lib/url_parser.php";
+/** Loads utilities for webencode and decode. */
+require_once BASE_DIR."/lib/utility.php";
+/** Loads the classifier library for managing user-trained classifiers */
+require_once BASE_DIR."/lib/classifiers/classifier.php";
+/** Loads mix archive iterator to iterate through mixes for classification */
+require_once BASE_DIR."/lib/archive_bundle_iterators/".
+    "mix_archive_bundle_iterator.php";
+
+/**
+ * This class handles XmlHttpRequests to label documents during classifier
+ * construction.
+ *
+ * Searching for new documents to label and add to the training set is a
+ * heavily-interactive operation, so it is implemented using asynchronous
+ * requests to this controller in order to fetch candidates for labeling and
+ * add labels without reloading the classifier edit page. The admin controller
+ * takes care of first displaying the "edit classifier" page, and handles
+ * requests to change a classifier's class label, but this controller handles
+ * the other asynchronous requests issued by the JavaScript on the page.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage controller
+ */
+class ClassifierController extends Controller implements CrawlConstants
+{
+    /**
+     * Models used by this controller
+     * @var array
+     */
+    var $models = array("crawl", "phrase");
+
+    /**
+     * This controller only outputs JSON data, so it does not need any views
+     * @var array
+     */
+    var $views = array();
+
+    /**
+     * These are the activities supported by this controller
+     * @var array
+     */
+    var $activities = array("classify");
+
+
+    /**
+     * Checks that the request seems to be coming from a legitimate, logged-in
+     * user, then dispatches to the appropriate activity.
+     */
+    function processRequest()
+    {
+        if(!isset($_REQUEST['a']) || !$this->checkRequest()) {return;}
+        $activity = $_REQUEST['a'];
+        if(in_array($activity, $this->activities, true)) {$this->$activity();}
+    }
+
+    /**
+     * Finds the next document for which to request a label, sometimes first
+     * recording the label that the user selected for the last document. This
+     * method should only be called via an XmlHttpRequest initiated by the edit
+     * classifier JavaScript, and consequently it always writes out
+     * JSON-encoded data. Absent request fields default to empty values.
+     */
+    function classify()
+    {
+        // Default absent fields so that no activity below reads an undefined
+        // variable (a notice that would garble the JSON if errors are shown).
+        $request = $_REQUEST + array('arg' => '', 'label' => '', 'index' => 0,
+            'type' => '', 'keywords' => '', 'doc_to_label' => array());
+        $arg = $this->clean($request['arg'], 'string');
+        $label = $this->clean($request['label'], 'string');
+        $source_type = $this->clean($request['type'], 'string');
+        $keywords = $this->clean($request['keywords'], 'string');
+        $index = $this->clean($request['index'], 'int');
+        if (intval($index) == 1) {
+            // TODO Fail in case that there's no current index
+            $index = $this->crawlModel->getCurrentIndexDatabaseName();
+        }
+
+        /*
+           The call to prepareToLabel is important; it loads all of the data
+           required to manage the training set from disk, and also determines
+           what will be saved *back* to disk later.
+         */
+        $classifier = Classifier::getClassifier($label);
+        $classifier->prepareToLabel();
+        $data = array();
+
+        switch ($arg) {
+            case 'getdocs':
+                /*
+                   Load documents in from a user-specified index, and find the
+                   next best one to label (for 'manual' source type), or label
+                   them all with a single label (for either the 'positive' or
+                   'negative' source types).
+                 */
+                $mix_iterator = $this->buildClassifierCrawlMix(
+                    $label, $index, $keywords);
+                if ($source_type == 'manual') {
+                    $num_docs = $classifier->initBuffer($mix_iterator);
+                    $classifier->computeBufferDensities();
+                    $data['num_docs'] = $num_docs;
+                    list($new_doc, $disagreement) =
+                        $classifier->findNextDocumentToLabel();
+                    if ($new_doc) {
+                        $score = $classifier->classify($new_doc);
+                        $data['new_doc'] = $this->prepareUnlabelledDocument(
+                            $new_doc, $score, $disagreement,
+                            $index, $keywords);
+                    }
+                    Classifier::setClassifier($classifier);
+                } else if ($source_type == 'positive' ||
+                    $source_type == 'negative') {
+                    $doc_label = ($source_type == 'positive') ? 1 : -1;
+                    $add_count = $classifier->addAllDocuments(
+                        $mix_iterator, $doc_label);
+                    if ($add_count > 0) {
+                        /*
+                           Pass true to always update accuracy after adding a
+                           batch of documents all at once.
+                         */
+                        $classifier->train(true);
+                        Classifier::setClassifier($classifier);
+                    }
+                    $data['add_count'] = $add_count;
+                }
+                break;
+
+            case 'addlabel':
+                /*
+                   First label the last candidate document presented to the
+                   user (potentially skipping it instead of actually applying a
+                   label), then pick the next best candidate for labeling.
+                   When skipping a document instead of adding a label, avoid
+                   re-training since the training set hasn't actually changed.
+                 */
+                $doc = is_array($request['doc_to_label']) ?
+                    $request['doc_to_label'] : array();
+                $doc += array('key' => '', 'label' => 0);
+                $key = webdecode($this->clean($doc['key'], 'string'));
+                $doc_label = $this->clean($doc['label'], 'int');
+                $mix_iterator = $this->retrieveClassifierCrawlMix($label);
+                $labels_changed = $classifier->labelDocument($key, $doc_label);
+                $num_docs = $classifier->refreshBuffer($mix_iterator);
+                $classifier->computeBufferDensities();
+                $data['num_docs'] = $num_docs;
+                if ($labels_changed) {
+                    $update_accuracy = $classifier->total > 0 &&
+                        $classifier->total % 10 == 0;
+                    $classifier->train($update_accuracy);
+                }
+                list($new_doc, $disagreement) =
+                    $classifier->findNextDocumentToLabel();
+                if ($new_doc) {
+                    $score = $classifier->classify($new_doc);
+                    $data['new_doc'] = $this->prepareUnlabelledDocument(
+                        $new_doc, $score, $disagreement,
+                        $index, $keywords);
+                }
+                Classifier::setClassifier($classifier);
+                break;
+
+            case 'updateaccuracy':
+                /*
+                   Don't do anything other than re-compute the accuracy for the
+                   current training set.
+                 */
+                $classifier->updateAccuracy();
+                Classifier::setClassifier($classifier);
+                break;
+        }
+
+        /*
+           No matter which activity we ended up carrying out, always include
+           the statistics that *might* have changed so that the client can just
+           naively keep them up to date.
+         */
+        $data['positive'] = $classifier->positive;
+        $data['negative'] = $classifier->negative;
+        $data['total'] = $classifier->total;
+        $data['accuracy'] = $classifier->accuracy;
+
+        /*
+           Pass along a new authentication token so that the client can make a
+           new authenticated request after this one.
+         */
+        $data['authTime'] = strval(time());
+        $data['authSession'] = md5($data['authTime'].AUTH_KEY);
+
+        $response = json_encode($data);
+        header("Content-Type: application/json");
+        header("Content-Length: ".strlen($response));
+        echo $response;
+    }
+
+    /* PRIVATE METHODS */
+
+    /**
+     * Creates a new crawl mix for an existing index, with an optional query,
+     * and returns an iterator for the mix. The crawl mix name is derived from
+     * the class label, so that it can be easily retrieved and deleted later
+     * on.
+     *
+     * @param string $label class label of the classifier the new crawl mix
+     *  will be associated with
+     * @param int $crawl_time timestamp of the index to be iterated over
+     * @param string $keywords an optional query used to restrict the pages
+     *  retrieved by the crawl mix
+     * @return object A MixArchiveBundleIterator instance that will iterate
+     *  over the pages of the requested index
+     */
+    function buildClassifierCrawlMix($label, $crawl_time, $keywords)
+    {
+        $mix_time = time();
+        $mix_name = Classifier::getCrawlMixName($label);
+
+        // Replace any existing crawl mix.
+        $old_time = $this->crawlModel->getCrawlMixTimestamp($mix_name);
+        if ($old_time) {
+            $this->crawlModel->deleteCrawlMixIteratorState($old_time);
+            $this->crawlModel->deleteCrawlMix($old_time);
+        }
+
+        $this->crawlModel->setCrawlMix(array(
+            'MIX_TIMESTAMP' => $mix_time,
+            'MIX_NAME' => $mix_name,
+            'GROUPS' => array(
+                array(
+                    'RESULT_BOUND' => 1,
+                    'COMPONENTS' => array(
+                        array(
+                            'CRAWL_TIMESTAMP' => $crawl_time,
+                            'WEIGHT' => 1.0,
+                            'KEYWORDS' => $keywords))))));
+
+        return new MixArchiveBundleIterator($mix_time, $mix_time);
+    }
+
+    /**
+     * Retrieves an iterator for an existing crawl mix. The crawl mix remembers
+     * its previous offset, so that the new iterator picks up where the
+     * previous one left off.
+     *
+     * @param string $label class label of the classifier this crawl mix is
+     *  associated with
+     * @return object new MixArchiveBundleIterator instance that picks up where
+     *  the previous one left off
+     */
+    function retrieveClassifierCrawlMix($label)
+    {
+        $mix_name = Classifier::getCrawlMixName($label);
+        $mix_time = $this->crawlModel->getCrawlMixTimestamp($mix_name);
+        return new MixArchiveBundleIterator($mix_time, $mix_time);
+    }
+
+    /**
+     * Creates a fresh array from an existing page summary array, and augments
+     * it with extra data relevant to the labeling interface on the client.
+     *
+     * @param array $page original page summary array
+     * @param float $score classification score (estimated by the Naive Bayes
+     *  text classification algorithm) for $page
+     * @param float $disagreement disagreement score computed for $page
+     * @param int $crawl_time index the page came from
+     * @param string $keywords query supplied to the crawl mix used to find
+     *  $page
+     * @return array reduced page summary structure containing only the
+     *  information that the client needs to display a summary of the page
+     */
+    function prepareUnlabelledDocument($page, $score, $disagreement,
+        $crawl_time, $keywords)
+    {
+        // Highlight the query keywords, if any.
+        $disjunct_phrases = explode("|", $keywords);
+        $words = array();
+        foreach ($disjunct_phrases as $disjunct_phrase) {
+            list($word_struct, $format_words) =
+                $this->phraseModel->parseWordStructConjunctiveQuery(
+                    $disjunct_phrase);
+            $words = array_merge($words, $format_words);
+        }
+        $title = $this->phraseModel->boldKeywords(
+            $page[self::TITLE], $words);
+        $description = $this->phraseModel->getSnippets(
+            strip_tags($page[self::DESCRIPTION]), $words, 400);
+        $description = $this->phraseModel->boldKeywords(
+            $description, $words);
+        $cache_link = "?c=search&amp;a=cache".
+            "&amp;q=".urlencode($keywords).
+            "&amp;arg=".urlencode($page[self::URL]).
+            "&amp;its=".$crawl_time;
+        /*
+           Note that the confidence is a transformation of the score that
+           converts it into a value between 0 and 1, where it's 0 if the score
+           was exactly 0.5, and increases toward 1 as the score either
+           increases toward 1 or decreases toward 0.
+         */
+        return array(
+            'title' => $title,
+            'url' => $page[self::URL],
+            'key' => webencode(Classifier::makeKey($page)),
+            'cache_link' => $cache_link,
+            'description' => $description,
+            'score' => $score,
+            'positive' => $score >= 0.5 ? 1 :0,
+            'confidence' => abs($score - 0.5) / 0.5,
+            'disagreement' => $disagreement);
+    }
+}
+?>
diff --git a/controllers/fetch_controller.php b/controllers/fetch_controller.php
index af45da888..f36f45115 100755
--- a/controllers/fetch_controller.php
+++ b/controllers/fetch_controller.php
@@ -37,6 +37,8 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
 require_once BASE_DIR."/controllers/controller.php";
 /** Loads common constants for web crawling*/
 require_once BASE_DIR."/lib/crawl_constants.php";
+/** For user-trained classification of page summaries*/
+require_once BASE_DIR."/lib/classifiers/classifier.php";

 /** get available archive iterators */
 foreach(glob(BASE_DIR."/lib/archive_bundle_iterators/*_bundle_iterator.php")
@@ -228,12 +230,20 @@ class FetchController extends Controller implements CrawlConstants
                 $arctype = $info[self::ARC_TYPE];
                 $iterator_name = $arctype."Iterator";

-                if($info[self::ARC_DIR] == "MIX") { //recrawl of crawl mix case
-                    $archive_iterator = new $iterator_name($iterate_timestamp,
-                        $result_timestamp);
-                } else { //any other archive crawl except web archive recrawls
-                    $archive_iterator = new $iterator_name($iterate_timestamp,
-                        $info[self::ARC_DIR], $result_timestamp, $result_dir);
+                if(!class_exists($iterator_name)) {
+                    $info['ARCHIVE_BUNDLE_ERROR'] =
+                        "Invalid bundle iterator: '{$iterator_name}'";
+                } else {
+                    if($info[self::ARC_DIR] == "MIX") {
+                        //recrawl of crawl mix case
+                        $archive_iterator = new $iterator_name(
+                            $iterate_timestamp, $result_timestamp);
+                    } else {
+                        //any other archive crawl except web archive recrawls
+                        $archive_iterator = new $iterator_name(
+                            $iterate_timestamp, $info[self::ARC_DIR],
+                            $result_timestamp, $result_dir);
+                    }
                 }
             }
             $pages = false;
@@ -518,6 +528,18 @@ class FetchController extends Controller implements CrawlConstants
                         $info[$field] = $status[$field];
                     }
                 }
+                /*
+                   When initiating a new crawl AND there are active
+                   classifiers (an array of class labels), then augment the
+                   info with compressed, serialized versions of each active
+                   classifier so that each fetcher can reconstruct the same
+                   classifiers.
+                 */
+                if (isset($status[self::ACTIVE_CLASSIFIERS])) {
+                    $classifiers_data = Classifier::loadClassifiersData(
+                            $status[self::ACTIVE_CLASSIFIERS]);
+                    $info[self::ACTIVE_CLASSIFIERS_DATA] = $classifiers_data;
+                }
             }
         }

diff --git a/controllers/resource_controller.php b/controllers/resource_controller.php
index fa79c0b67..ca3392744 100644
--- a/controllers/resource_controller.php
+++ b/controllers/resource_controller.php
@@ -50,7 +50,7 @@ require_once BASE_DIR."/lib/url_parser.php";
 class ResourceController extends Controller implements CrawlConstants
 {
     /**
-     * No models used by this controller
+     * Models used by this controller
      * @var array
      */
     var $models = array("crawl");
diff --git a/css/search.css b/css/search.css
index 82f34f937..ff20d302b 100755
--- a/css/search.css
+++ b/css/search.css
@@ -1675,6 +1675,9 @@ ul.in-list li
 .mixes-table,
 .mixes-table td,
 .mixes-table th,
+.classifiers-table,
+.classifiers-table td,
+.classifiers-table th,
 .search-page-table,
 .search-page-table td,
 .search-sources-table,
@@ -1687,6 +1690,8 @@ ul.in-list li

 .html-ltr .file-types-table,
 .html-ltr .file-types-all td,
+.html-ltr .classifiers-table,
+.html-ltr .classifiers-all td,
 .html-ltr .search-page-table,
 .html-ltr .search-page-all td
 {
@@ -1817,3 +1822,81 @@ ul.in-list li
     height:0.25in;
     width: 0.25in;
 }
+
+/*
+  Styles for the classifiers settings pages (update-accuracy, label-docs-*)
+ */
+#update-accuracy.disabled
+{
+    color: gray;
+}
+
+#label-docs-form th,
+#label-docs-queue .actions
+{
+    width: 1.0in;
+    padding: 0.1in 0.2in 0.1in 0in;
+    font-weight: normal;
+}
+
+#label-docs-queue .actions
+{
+    padding: 0.1in; /* overrides the padding from the shared rule above */
+}
+
+#label-docs-queue
+{
+    font-size: 11pt;
+}
+
+#label-docs-queue td
+{
+    vertical-align: top;
+    padding-top: 10px;
+}
+
+#label-docs-queue .labelled td
+{
+    background-color: #f0f0f0;
+}
+
+#label-docs-queue .notinclass td
+{
+    background-color: #FFE0E0;
+}
+
+#label-docs-queue .inclass td
+{
+    background-color: #F0FFF0;
+}
+
+#label-docs-queue tr.inclass a.inclass,
+#label-docs-queue tr.notinclass a.notinclass,
+#label-docs-queue tr.skip a.skip
+{
+    /* a link whose class matches its row's class reads as plain bold text */
+    text-decoration: none;
+    color: black;
+    cursor: default;
+    font-weight: bold;
+}
+
+#label-docs-queue p
+{
+    width: auto;
+    margin: 3px 0px;
+}
+
+#label-docs-queue .info
+{
+    padding-left: 7px;
+}
+
+#label-docs-queue .description
+{
+    color: #666;
+}
+
+#label-docs-queue .description b
+{
+    color: black; /* bold text stands out against the #666 description */
+}
diff --git a/data/default.db b/data/default.db
index 7fd38ac51..7b2a308fe 100644
Binary files a/data/default.db and b/data/default.db differ
diff --git a/index.php b/index.php
index 71b2133d0..dadfe1715 100755
--- a/index.php
+++ b/index.php
@@ -127,7 +127,7 @@ if (function_exists('lcfirst') === false) {

 $available_controllers = array( "admin", "archive",  "cache", "crawl",
     "fetch",  "machine", "resource", "search", "settings", "statistics",
-    "static",);
+    "static", "classifier");
 if(!WEB_ACCESS) {
 $available_controllers = array("admin", "archive", "cache", "crawl", "fetch",
      "machine");
diff --git a/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php b/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php
index 2cb7a4a60..31737221f 100644
--- a/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/mix_archive_bundle_iterator.php
@@ -72,7 +72,14 @@ class MixArchiveBundleIterator extends ArchiveBundleIterator
     var $mix_timestamp;

     /**
-     * count of how far our into the crawl mix we've gone.
+     * Used to hold timestamp of the index archive bundle of output results
+     *
+     * @var int
+     */
+    var $result_timestamp;
+
+    /**
+     * count of how far out into the crawl mix we've gone.
      *
      * @var int
      */
@@ -175,8 +182,8 @@ class MixArchiveBundleIterator extends ArchiveBundleIterator
      */
     function nextPages($num, $no_process = false)
     {
+        $objects = array("NO_PROCESS" => false);
         if($this->end_of_iterator) {
-            $objects = array("NO_PROCESS" => false);
             return $objects;
         }
         $results = $this->searchController->queryRequest($this->query,
@@ -189,7 +196,7 @@ class MixArchiveBundleIterator extends ArchiveBundleIterator
         } else if ($num_results == 0) {
             $this->end_of_iterator = true;
         } else {
-            $objects = array("NO_PROCESS" => $results);
+            $objects['NO_PROCESS'] = $results;
         }
         if(isset($results["SAVE_POINT"]) ){
             $end = true;
diff --git a/lib/classifiers/classifier.php b/lib/classifiers/classifier.php
new file mode 100644
index 000000000..fbad8eeeb
--- /dev/null
+++ b/lib/classifiers/classifier.php
@@ -0,0 +1,1302 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage classifier
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/** Common constants for page summaries */
+require_once BASE_DIR."/lib/crawl_constants.php";
+/** Used for keeping track of the vocabulary and feature map */
+require_once BASE_DIR."/lib/classifiers/features.php";
+/** Used to restrict features to an informative subset */
+require_once BASE_DIR."/lib/classifiers/feature_selection.php";
+/** Naive Bayes text classification algorithm */
+require_once BASE_DIR."/lib/classifiers/naive_bayes.php";
+/** Logistic regression text classification algorithm */
+require_once BASE_DIR."/lib/classifiers/lasso_regression.php";
+/** Used to guess locale from a string */
+require_once BASE_DIR."/lib/locale_functions.php";
+/** Used to tokenize page summaries */
+require_once BASE_DIR."/lib/phrase_parser.php";
+
+/**
+ * The primary interface for building and using classifiers. An instance of
+ * this class represents a single classifier in memory, but the class also
+ * provides static methods to manage classifiers on disk.
+ *
+ * A single classifier is a tool for determining the likelihood that a document
+ * is a positive instance of a particular class. In order to do this, a
+ * classifier goes through a training phase on a labeled training set where it
+ * learns weights for document features (terms, for our purposes). To classify
+ * a new document, the learned weights for all terms in the document are
+ * combined in order to yield a pseudo-probability that the document belongs to
+ * the class.
+ *
+ * A classifier is composed of a candidate buffer, a training set, a set of
+ * features, and a classification algorithm. In addition to the set of all
+ * features, there is a restricted set of features used for training and
+ * classification. There are also two classification algorithms: a Naive Bayes
+ * algorithm used during labeling, and a logistic regression algorithm used to
+ * train the final classifier. In general, a fresh classifier will first go
+ * through a labeling phase where a collection of labeled training documents is
+ * built up out of existing crawl indexes, and then a finalization phase where
+ * the logistic regression algorithm will be trained on the training set
+ * established in the first phase. After finalization, the classifier may be
+ * used to classify new web pages during a crawl.
+ *
+ * During the labeling phase, the classifier fills a buffer of candidate pages
+ * from the user-selected index (optionally restricted by a query), and tries
+ * to pick the best one to present to the user to be labeled (here `best' means
+ * the one that, once labeled, is most likely to improve classification
+ * accuracy). Each labeled document is removed from the buffer, converted to a
+ * feature vector (described next), and added to the training set. The expanded
+ * training set is then used to train an intermediate Naive Bayes
+ * classification algorithm that is in turn used to more accurately identify
+ * good candidates for the next round of labeling. This phase continues until
+ * the user gets tired of labeling documents, or is happy with the estimated
+ * classification accuracy.
+ *
+ * Instead of passing around terms everywhere, each document that goes into the
+ * training set is first mapped through a Features instance that maps terms to
+ * feature indices (e.g. "Pythagorean" => 1, "theorem" => 2, etc.). These
+ * feature indices are used internally by the classification algorithms, and by
+ * the algorithms that try to pick out the most informative features. In
+ * addition to keeping track of the mapping between terms and feature indices,
+ * a Features instance keeps term and label statistics (such as how often a
+ * term occurs in documents with a particular label) used to weight features
+ * within a document and to select informative features. Finally, subclasses of
+ * the Features class weight features in different ways, presenting more or
+ * less of everything that's known about the frequency or informativeness of a
+ * feature to classification algorithms.
+ *
+ * Once a sufficiently-useful training set has been built, a FeatureSelection
+ * instance is used to choose the most informative features, and copy these
+ * into a reduced Features instance that has a much smaller vocabulary, and
+ * thus a much smaller memory footprint. For efficiency, this is the Features
+ * instance used to train classification algorithms, and to classify web pages.
+ * Finalization is just the process of training a logistic regression
+ * classification algorithm on the full training set. This results in a set of
+ * feature weights that can be used to efficiently assign a pseudo-probability
+ * to the proposition that a new web page is a positive instance of the class
+ * that the classifier has been trained to recognize. Training logistic
+ * regression on a large training set can take a long time, so this phase is
+ * carried out asynchronously, by a daemon launched in response to the
+ * finalization request.
+ *
+ * Because the full Features instance, buffer, and training set are only needed
+ * during the labeling and finalization phases, and because they can get very
+ * large and take up a lot of space in memory, this class separates its large
+ * instance members into separate files when serializing to disk. When a
+ * classifier is first loaded into memory from disk it brings along only its
+ * summary statistics, since these are all that are needed to, for example,
+ * display a list of classifiers. In order to actually add new documents to the
+ * training set, finalize, or classify, the classifier must first be explicitly
+ * told to load the relevant data structures from disk; this is accomplished by
+ * methods like prepareToLabel and prepareToClassify.  These methods load in
+ * the relevant serialized structures, and mark the associated data members for
+ * storage back to disk when (or if) the classifier is serialized again.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage classifier
+ */
+class Classifier implements CrawlConstants
+{
+    /**
+     * The maximum number of candidate documents to consider at once in order
+     * to find the best candidate.
+     */
+    const BUFFER_SIZE = 51;
+
+    /**
+     * The number of Naive Bayes instances to use to calculate disagreement
+     * during candidate selection.
+     */
+    const COMMITTEE_SIZE = 3;
+
+    /**
+     * The maximum disagreement score between candidates. This number depends
+     * on committee size, and is used to provide a slightly more user-friendly
+     * estimate of how much disagreement a document causes (between 0 and 1).
+     */
+    const MAX_DISAGREEMENT = 1.63652; // Depends on committee size
+
+    /**
+     * Lambda parameter used in the computation of a candidate document's
+     * density (smoothing for 0-frequency terms).
+     */
+    const DENSITY_LAMBDA = 0.5;
+
+    /**
+     * Beta parameter used in the computation of a candidate document's density
+     * (sharpness of the KL-divergence).
+     */
+    const DENSITY_BETA = 3.0;
+
+    /**
+     * Threshold used to convert a pseudo-probability to a hard classification
+     * decision. Documents with pseudo-probability >= THRESHOLD are classified
+     * as positive instances.
+     */
+    const THRESHOLD = 0.5;
+
+    /**
+     * Indicates that a classifier needs to be finalized before it can be used.
+     */
+    const UNFINALIZED = 0;
+
+    /**
+     * Indicates that a classifier is currently being finalized (this may take
+     * a while).
+     */
+    const FINALIZING = 1;
+
+    /**
+     * Indicates that a classifier has been finalized, and is ready to be used
+     * for classification.
+     */
+    const FINALIZED = 2;
+
+    /**
+     * Default per-classifier options, which may be overridden when
+     * constructing a new classifier. The supported options are:
+     *
+     *     float density.lambda: Lambda parameter used in the computation of a
+     *         candidate document's density (smoothing for 0-frequency terms).
+     *
+     *     float density.beta: Beta parameter used in the computation of a
+     *         candidate document's density (sharpness of the KL-divergence).
+     *
+     *     int label_fs.max: Use the `label_fs' most informative features to
+     *         train the Naive Bayes classifiers used during labeling to
+     *         compute disagreement for a document.
+     *
+     *     float threshold: Threshold used to convert a pseudo-probability to a
+     *         hard classification decision. Documents with pseudo-probability
+     *         >= `threshold' are classified as positive instances.
+     *
+     *     string final_algo: Algorithm to use for finalization; 'lr' for
+     *         logistic regression, or 'nb' for Naive Bayes; default 'lr'.
+     *
+     *     int final_fs.max: Use the `final_fs.max' most informative features
+     *         to train the final classifier.
+     *
+     * @var array
+     */
+    var $options = array(
+        'density' => array(
+            'lambda' => 0.5,
+            'beta' => 3.0),
+        'threshold' => 0.5,
+        'label_fs' => array(
+            'max' => 30),
+        'final_fs' => array(
+            'max' => 200),
+        'final_algo' => 'lr');
+
+    /**
+     * The label applied to positive instances of the class learned by this
+     * classifier (e.g., `spam').
+     * @var string
+     */
+    var $class_label;
+
+    /**
+     * Creation time as a UNIX timestamp.
+     * @var int
+     */
+    var $timestamp;
+
+    /**
+     * Language of documents in the training set (also how new documents will
+     * be treated).
+     * @var string
+     */
+    var $lang;
+
+    /**
+     * Whether or not this classifier has had any training examples added to
+     * it, and consequently whether or not its Naive Bayes classification
+     * algorithm has ever been trained.
+     * @var bool
+     */
+    var $fresh = true;
+
+    /**
+     * Finalization status, as determined by one of the three finalization
+     * constants.
+     * @var int
+     */
+    var $finalized = 0;
+
+    /**
+     * The number of positive examples in the training set.
+     * @var int
+     */
+    var $positive = 0;
+
+    /**
+     * The number of negative examples in the training set.
+     * @var int
+     */
+    var $negative = 0;
+
+    /**
+     * The total number of examples in the training set (sum of positive and
+     * negative).
+     * @var int
+     */
+    var $total = 0;
+
+    /**
+     * The estimated classification accuracy. This member may be null if the
+     * accuracy has not yet been estimated, or out of date if examples have
+     * been added to the training set since the last accuracy update, but no
+     * new estimate has been computed.
+     * @var float
+     */
+    var $accuracy;
+
+    /*
+       The following properties are all serialized, compressed, and stored in
+       individual files, then loaded on demand.
+    */
+
+    /**
+     * The current pool of candidates for labeling. The first element in the
+     * buffer is always the active document, and as active documents are
+     * labeled and removed, the pool is refreshed with new candidates (if there
+     * are more pages to be drawn from the active index). The buffer is
+     * represented as an associative array with three fields: 'docs', the
+     * candidate page summaries; 'densities', an array of densities computed
+     * for the documents in the candidate pool; and 'stats', statistics about
+     * the terms and documents in the current pool.
+     * @var array
+     */
+    var $buffer;
+
+    /**
+     * The training set, broken up into two fields of an associative array:
+     * 'features', an array of document feature vectors; and 'labels', the
+     * labels assigned to each document.
+     * @var array
+     */
+    var $docs;
+
+    /**
+     * The Features subclass instance used to manage the full set of features
+     * seen across all documents in the training set.
+     * @var object
+     */
+    var $full_features;
+
+    /**
+     * The Features subclass instance used to manage the reduced set of
+     * features used only by Naive Bayes classification algorithms during the
+     * labeling phase.
+     * @var object
+     */
+    var $label_features;
+
+    /**
+     * The NaiveBayes classification algorithm used during training to
+     * tentatively classify documents presented to the user for labeling.
+     * @var object
+     */
+    var $label_algorithm;
+
+    /**
+     * The Features subclass instance used to map documents at classification
+     * time to the feature vectors expected by classification algorithms. This
+     * will generally be a reduced feature set, just like that used during
+     * labeling, but potentially larger than the set used by Naive Bayes.
+     * @var object
+     */
+    var $final_features;
+
+    /**
+     * The finalized classification algorithm that will be used to classify new
+     * web pages. Will usually be logistic regression, but may be Naive Bayes,
+     * if set by the options. During labeling, this field is a reference to the
+     * Naive Bayes classification algorithm (so that that algorithm will be
+     * used by the `classify' method), but it won't be saved to disk as such.
+     * @var object
+     */
+    var $final_algorithm;
+
+    /**
+     * The names of properties set by one of the prepareTo* methods; these
+     * properties will be saved back to disk during serialization, while all
+     * other properties not listed by the __sleep method will be discarded.
+     * @var array
+     */
+    var $loaded_properties = array();
+
+    /* PUBLIC INTERFACE */
+
+    /**
+     * Initializes a new classifier with a class label, and options to override
+     * the defaults. The timestamp associated with the classifier is taken from
+     * the time of construction.
+     *
+     * @param string $label class label applied to positive instances of the
+     *  class this classifier is trained to recognize
+     * @param array $options optional associative array of options that will
+     *  override the default options
+     */
+    function __construct($label, $options = array())
+    {
+        $this->class_label = $label;
+        // The creation time doubles as the classifier's timestamp.
+        $this->timestamp = time();
+        /*
+           Merge recursively: several defaults are nested arrays (e.g.
+           'density' => array('lambda' => ..., 'beta' => ...)), and a shallow
+           array_merge would replace the whole nested array when only one of
+           its entries is overridden, silently dropping the sibling defaults.
+         */
+        $this->options = array_replace_recursive($this->options, $options);
+    }
+
+    /**
+     * Magic method that determines which member data will be stored when
+     * serializing this class. Only lightweight summary data are stored with
+     * the serialized version of this class. The heavier-weight properties are
+     * stored in individual, compressed files.
+     *
+     * @return array names of properties to store when serializing this
+     *  instance
+     */
+    function __sleep()
+    {
+        /*
+           Only these lightweight summary fields are part of the serialized
+           form; the names are grouped here purely for readability.
+         */
+        $identity = array('options', 'class_label', 'timestamp', 'lang');
+        $status = array('fresh', 'finalized');
+        $counts = array('positive', 'negative', 'total', 'accuracy');
+        return array_merge($identity, $status, $counts);
+    }
+
+    /* PREPARING FOR A TASK */
+
+    /**
+     * Prepare this classifier instance for labeling. This operation requires
+     * all of the heavyweight member data save the final features and
+     * algorithm. Note that these properties are set to references to the
+     * Naive Bayes features and algorithm, so that Naive Bayes will be used to
+     * tentatively classify documents during labeling (purely to give the user
+     * some feedback on how the training set is performing).
+     */
+    function prepareToLabel()
+    {
+        $this->loadProperties('buffer', 'docs', 'full_features',
+            'label_features', 'label_algorithm');
+        /*
+           Whatever had no file on disk is NULL after loading; give those
+           properties an empty starting value.
+         */
+        if ($this->docs === NULL) {
+            $this->docs = array('features' => array(), 'labels' => array());
+        }
+        if ($this->full_features === NULL) {
+            $this->full_features = new BinaryFeatures();
+        }
+        if ($this->label_algorithm === NULL) {
+            $this->label_algorithm = new NaiveBayes();
+        }
+        // While labeling, classification goes through the labeling objects.
+        $this->final_algorithm = $this->label_algorithm;
+        $this->final_features = $this->label_features;
+    }
+
+    /**
+     * Prepare to train a final classification algorithm on the full training
+     * set. This operation requires the full training set and features, but not
+     * the candidate buffer used during labeling. Note that any existing final
+     * features and classification algorithm are simply zeroed out; they are
+     * only loaded from disk so that they will be written back after
+     * finalization completes.
+     */
+    function prepareToFinalize()
+    {
+        /*
+           Flag the classifier as FINALIZING and hand it to setClassifier
+           (defined elsewhere in this class; presumably it persists the new
+           status -- confirm) before the heavyweight data is loaded.
+         */
+        $this->finalized = self::FINALIZING;
+        self::setClassifier($this);
+        $this->loadProperties('docs', 'full_features', 'final_features',
+            'final_algorithm');
+        // Anything other than 'nb' (compared case-insensitively) selects
+        // lasso regression.
+        $wants_naive_bayes =
+            strcasecmp($this->options['final_algo'], 'nb') == 0;
+        $this->final_algorithm = $wants_naive_bayes ?
+            new NaiveBayes() : new LassoRegression();
+        // The final feature set is rebuilt from scratch during finalization.
+        $this->final_features = NULL;
+    }
+
+    /**
+     * Prepare to classify new web pages. This operation requires only the
+     * final features and classification algorithm, which are expected to be
+     * defined after the finalization phase.
+     */
+    function prepareToClassify()
+    {   // Only final_features and final_algorithm are loaded (and tracked).
+        $this->loadProperties('final_features', 'final_algorithm');
+    }
+
+    /* LABELING PHASE */
+
+    /**
+     * Updates the buffer and training set to reflect the label given to a new
+     * document. The label may be -1, 1, or 0, where the first two correspond
+     * to a negative or positive example, and the last to a skip. The handling
+     * for a skip is necessarily different from that for a positive or negative
+     * label, and matters are further complicated by the possibility that we
+     * may be changing a label for a document that's already in the training
+     * set, rather than adding a new document. This function returns true if
+     * the new label resulted in a change to the training set, and false
+     * otherwise (i.e., if the user simply skipped labeling the candidate
+     * document).
+     *
+     * When updating an existing document, we will either need to swap the
+     * label in the training set and update the statistics stored by the
+     * Features instance (since now the features are associated with a
+     * different label), or drop the document from the training set and (again)
+     * update the statistics stored by the Features instance. In either case
+     * the negative and positive counts must be updated as well.
+     *
+     * When working with a new document, we need to remove it from the
+     * candidate buffer, and if the label is non-zero then we also need to add
+     * the document to the training set. That involves tokenizing the document,
+     * passing the tokens through the full_features instance, and storing the
+     * resulting feature vector, plus the new label in the docs attribute. The
+     * positive and negative counts must be updated as well.
+     *
+     * Finally, if this operation is occurring during active labeling (when
+     * the user is providing labels one at a time), that information needs to
+     * be passed along to dropBufferDoc, which can avoid doing some work in the
+     * non-active case.
+     *
+     * @param string $key key used to select the document from the docs array
+     * @param int $label new label (-1, 1, or 0)
+     * @param bool $is_active whether this operation is being carried out
+     *  during active labeling
+     * @return bool true if the training set was modified, and false otherwise
+     */
+    function labelDocument($key, $label, $is_active = true)
+    {
+        $labels_changed = true;
+        if (isset($this->docs['labels'][$key])) {
+            // The document is already part of the training set.
+            $prev_label = $this->docs['labels'][$key];
+            if ($label == $prev_label) {
+                /*
+                   Re-applying the label the document already carries is a
+                   no-op. The swap arithmetic below assumes the label really
+                   flipped, and would otherwise skew the positive/negative
+                   counts.
+                 */
+                $labels_changed = false;
+            } else if ($label != 0) {
+                $this->full_features->updateExampleLabel(
+                    $this->docs['features'][$key], $prev_label, $label);
+                $this->docs['labels'][$key] = $label;
+                // The label flipped: move one example between the counts.
+                if ($label > 0) {
+                    $this->positive++;
+                    $this->negative--;
+                } else {
+                    $this->negative++;
+                    $this->positive--;
+                }
+            } else {
+                // A zero label drops the document from the training set.
+                $this->full_features->updateExampleLabel(
+                    $this->docs['features'][$key], $prev_label, 0);
+                unset($this->docs['features'][$key]);
+                unset($this->docs['labels'][$key]);
+                if ($prev_label > 0) {
+                    $this->positive--;
+                } else {
+                    $this->negative--;
+                }
+            }
+        } else if ($label == 0) {
+            // Skipping a new candidate: discard it, the training set is
+            // untouched.
+            $labels_changed = false;
+            $this->dropBufferDoc($is_active);
+        } else {
+            // New example: the candidate being labeled is the one at the
+            // front of the buffer.
+            if ($label > 0) {
+                $this->positive++;
+            } else {
+                $this->negative++;
+            }
+            $doc = $this->buffer['docs'][0];
+            $features = $this->full_features->addExample(
+                $doc['TERMS'], $label);
+            $this->docs['features'][$key] = $features;
+            $this->docs['labels'][$key] = $label;
+            $this->dropBufferDoc($is_active);
+        }
+        $this->total = $this->negative + $this->positive;
+        if ($labels_changed) {
+            /*
+               Only a real change to the training set ends the "fresh" state
+               and invalidates a previous finalization. A skipped candidate
+               must not clear `fresh': nothing has been trained yet, so the
+               feature and algorithm objects that classify and
+               findNextDocumentToLabel dereference may still be NULL.
+             */
+            $this->fresh = false;
+            $this->finalized = self::UNFINALIZED;
+        }
+        return $labels_changed;
+    }
+
+    /**
+     * Iterates entirely through a crawl mix iterator, adding each document
+     * (that hasn't already been labeled) to the training set with a single
+     * label. This function works by running through the iterator, filling up
+     * the candidate buffer with all unlabeled documents, then repeatedly
+     * dropping the first buffer document and adding it to the training set.
+     * Returns the total number of newly-labeled documents.
+     *
+     * @param object $mix_iterator crawl mix iterator to draw documents from
+     * @param int $label label to apply to every document; -1 or 1, but NOT 0
+     * @param int $limit optional upper bound on the number of documents to
+     *  add; defaults to no limit
+     * @return int total number of newly-labeled documents
+     */
+    function addAllDocuments($mix_iterator, $label, $limit = INF)
+    {
+        // Requesting a zero-sized buffer just resets it; $count starts at 0.
+        $count = $this->initBuffer($mix_iterator, 0);
+        while (!$mix_iterator->end_of_iterator && $count < $limit) {
+            $new_pages = $mix_iterator->nextPages(500);
+            if (isset($new_pages['NO_PROCESS'])) {
+                unset($new_pages['NO_PROCESS']);
+            }
+            $num_pages = 0;
+            /*
+               Keys already queued from this batch. Without this, a second
+               page with the same key would be queued as well, then treated
+               by labelDocument as an existing document (so it is never
+               dropped from the buffer), leaving it stuck at the front for
+               every later batch.
+             */
+            $batch_keys = array();
+            // foreach replaces each(), which was removed in PHP 8.
+            foreach ($new_pages as $page) {
+                if ($count + $num_pages >= $limit) {
+                    break;
+                }
+                $key = self::makeKey($page);
+                if (isset($this->docs['labels'][$key]) ||
+                    isset($batch_keys[$key])) {
+                    continue;
+                }
+                $batch_keys[$key] = true;
+                $this->addBufferDoc($page, false);
+                $num_pages++;
+            }
+            // Label the queued pages one at a time, always from the front.
+            for ($i = $num_pages; $i > 0; $i--) {
+                $key = self::makeKey($this->buffer['docs'][0]);
+                $this->labelDocument($key, $label, false);
+            }
+            $count += $num_pages;
+        }
+        return $count;
+    }
+
+    /**
+     * Drops any existing candidate buffer, re-initializes the buffer
+     * structure, then calls refreshBuffer to fill it. Takes an optional buffer
+     * size, which can be used to limit the buffer to something other than the
+     * number imposed by the runtime parameter. Returns the final buffer size.
+     *
+     * @param object $mix_iterator crawl mix iterator to draw documents from
+     * @param int $buffer_size optional buffer size to use; defaults to the
+     *  runtime parameter
+     * @return int final buffer size
+     */
+    function initBuffer($mix_iterator, $buffer_size = NULL)
+    {
+        // Zeroed term/document statistics for the new, empty pool.
+        $empty_stats = array(
+            'terms' => array(),
+            'num_tokens' => 0,
+            'docs' => array(),
+            'num_docs' => 0);
+        // Replacing the whole structure discards any previous candidates.
+        $this->buffer = array(
+            'docs' => array(),
+            'densities' => array(),
+            'stats' => $empty_stats);
+        return $this->refreshBuffer($mix_iterator, $buffer_size);
+    }
+
+    /**
+     * Adds as many new documents to the candidate buffer as necessary to reach
+     * the specified buffer size, which defaults to the runtime parameter.
+     * Returns the final buffer size, which may be less than that requested if
+     * the iterator doesn't return enough documents.
+     *
+     * @param object $mix_iterator crawl mix iterator to draw documents from
+     * @param int $buffer_size optional buffer size to use; defaults to the
+     *  runtime parameter
+     * @return int final buffer size
+     */
+    function refreshBuffer($mix_iterator, $buffer_size = NULL)
+    {
+        if ($buffer_size === NULL) {
+            $buffer_size = self::BUFFER_SIZE;
+        }
+        $pool_size = count($this->buffer['docs']);
+        while ($pool_size < $buffer_size &&
+            !$mix_iterator->end_of_iterator) {
+            // Ask only for as many pages as are still missing.
+            $candidates = $mix_iterator->nextPages($buffer_size - $pool_size);
+            if (isset($candidates['NO_PROCESS'])) {
+                unset($candidates['NO_PROCESS']);
+            }
+            foreach ($candidates as $candidate) {
+                // Pages that already carry a label are not candidates.
+                if (isset($this->docs['labels'][self::makeKey($candidate)])) {
+                    continue;
+                }
+                $this->addBufferDoc($candidate);
+                $pool_size++;
+            }
+        }
+        return $pool_size;
+    }
+
+    /**
+     * Computes from scratch the buffer densities of the documents in the
+     * current candidate pool. This is an expensive operation that requires
+     * the computation of the KL-divergence between each ordered pair of
+     * documents in the pool, approximately O(N^2) computations, total (where N
+     * is the number of documents in the pool). The densities are saved in the
+     * buffer data structure.
+     *
+     * The density of a document is approximated by its average overlap with
+     * every other document in the candidate buffer, where the overlap between
+     * two documents is itself approximated using the exponential, negative
+     * KL-divergence between them. The KL-divergence is smoothed to deal with
+     * features (terms) that occur in one distribution (document) but not the
+     * other, and then multiplied by a negative constant and exponentiated in
+     * order to convert it to a kind of linear overlap score.
+     */
+    function computeBufferDensities()
+    {
+        $this->buffer['densities'] = array();
+        $stats =& $this->buffer['stats'];
+        $lambda = self::DENSITY_LAMBDA;
+        $num_docs = $stats['num_docs'];
+        foreach ($stats['docs'] as $i => $doc_i) {
+            // Accumulates -beta * KL(doc_h || smoothed doc_i) over all h != i.
+            $log_overlap = 0.0;
+            foreach ($stats['docs'] as $h => $doc_h) {
+                if ($h != $i) {
+                    $divergence = 0.0;
+                    foreach ($doc_h as $term => $p) {
+                        $in_doc_i = isset($doc_i[$term]) ?
+                            $doc_i[$term] : 0.0;
+                        // Smooth doc_i's frequency with the pool-wide one.
+                        $q = $lambda * $in_doc_i +
+                            (1.0 - $lambda) *
+                            $stats['terms'][$term] / $stats['num_tokens'];
+                        $divergence += $p * log($p / $q);
+                    }
+                    $log_overlap += -self::DENSITY_BETA * $divergence;
+                }
+            }
+            $this->buffer['densities'][] = exp($log_overlap / $num_docs);
+        }
+    }
+
+    /**
+     * Finds the next best document for labeling amongst the documents in the
+     * candidate buffer, moves that candidate to the front of the buffer, and
+     * returns it.  The best candidate is the one with the maximum product of
+     * disagreement and density, where the density has already been calculated
+     * for each document in the current pool, and the disagreement is the
+     * KL-divergence between the classification scores obtained from a
+     * committee of Naive Bayes classifiers, each sampled from the current
+     * set of features.
+     *
+     * @return array two-element array containing first the best candidate, and
+     *  second the disagreement score, obtained by dividing the disagreement
+     *  for the document by the maximum disagreement possible for the committee
+     *  size
+     */
+    function findNextDocumentToLabel()
+    {
+        if (empty($this->buffer['docs'])) {
+            return array(NULL, 0.0);
+        }
+        if ($this->fresh) {
+            // Nothing trained yet: just offer the first candidate.
+            return array($this->buffer['docs'][0], 0.0);
+        }
+        // $votes[doc index][committee member] = classification score
+        $votes = array_fill(0, count($this->buffer['docs']), array());
+        $member = 0;
+        while ($member < self::COMMITTEE_SIZE) {
+            $sampled = new NaiveBayes();
+            $sampled->sampleBeta($this->label_features);
+            foreach ($this->buffer['docs'] as $idx => $candidate) {
+                $vector = $this->label_features->mapDocument(
+                    $candidate['TERMS']);
+                $votes[$idx][$member] = $sampled->classify($vector);
+            }
+            $member++;
+        }
+        $best_idx = 0;
+        $best_score = -INF;
+        $best_disagreement = -INF;
+        $pool_densities =& $this->buffer['densities'];
+        foreach ($votes as $idx => $scores) {
+            $disagreement = 1.0 + self::klDivergenceToMean($scores);
+            $weighted = $disagreement * $pool_densities[$idx];
+            // Strict comparison: on ties the earliest candidate wins.
+            if ($weighted > $best_score) {
+                $best_idx = $idx;
+                $best_score = $weighted;
+                $best_disagreement = $disagreement;
+            }
+        }
+        $chosen = $this->buffer['docs'][$best_idx];
+        $this->moveBufferDocToFront($best_idx);
+        return array($chosen, $best_disagreement / self::MAX_DISAGREEMENT);
+    }
+
+    /**
+     * Trains the Naive Bayes classification algorithm used during labeling on
+     * the current training set, and optionally updates the estimated accuracy.
+     *
+     * @param bool $update_accuracy optional parameter specifying whether or not
+     *  to update the accuracy estimate after training completes; defaults to
+     *  false
+     */
+    function train($update_accuracy = false)
+    {
+        // Reduce the full feature set using the labeling-phase options.
+        $selector = new ChiSquaredFeatureSelection(
+            $this->options['label_fs']);
+        $reduced = $this->full_features->restrict($selector);
+        // Both properties point at the same reduced feature object.
+        $this->label_features = $reduced;
+        $this->final_features = $reduced;
+        $examples = $reduced->mapTrainingSet($this->docs['features']);
+        $labels = array_values($this->docs['labels']);
+        $this->label_algorithm->train($examples, $labels);
+        if (!$update_accuracy) {
+            return;
+        }
+        // Reuse the already-mapped training set for the accuracy estimate.
+        $this->updateAccuracy($examples, $labels);
+    }
+
+    /**
+     * Estimates current classification accuracy using a Naive Bayes
+     * classification algorithm. Accuracy is estimated by splitting the current
+     * training set into fifths, reserving four fifths for training, and the
+     * remaining fifth for testing. A fresh classifier is trained and tested
+     * on these splits, and the total accuracy recorded. Then the splits are
+     * rotated so that the previous testing fifth becomes part of the training
+     * set, and one of the blocks from the previous training set becomes the
+     * testing set. A new classifier is trained and tested on the new splits,
+     * and, again, the accuracy recorded. This process is repeated until all
+     * blocks have been used for testing, and the average accuracy recorded.
+     *
+     * @param object $X optional sparse matrix representing the already-mapped
+     *  training set to use; if not provided, the current training set is
+     *  mapped using the label_features property
+     * @param array $y optional array of document labels corresponding to the
+     *  training set; if not provided the current training set labels are used
+     */
+    function updateAccuracy($X = NULL, $y = NULL)
+    {
+        if ($X === NULL) {
+            $X = $this->label_features->mapTrainingSet(
+                $this->docs['features']);
+        }
+        // Sample size: whole tens only, capped at 250 examples; with fewer
+        // than ten examples no estimate is made.
+        $sample_size = min(250, intval(floor($X->rows() / 10)) * 10);
+        if ($sample_size < 10) {
+            return;
+        }
+        if ($y === NULL) {
+            $y = array_values($this->docs['labels']);
+        }
+        $order = array_rand($y, $sample_size);
+        shuffle($order);
+        $num_folds = 5;
+        $fold_size = $sample_size / $num_folds;
+        $train_size = ($num_folds - 1) * $fold_size;
+        $accuracy_sum = 0.0;
+        for ($fold = 0; $fold < $num_folds; $fold++) {
+            if ($fold > 0) {
+                // Rotate: the previous test block moves to the front, so a
+                // different block ends up in the test position.
+                $tail = array_splice($order, $train_size);
+                array_splice($order, 0, 0, $tail);
+            }
+            $train_rows = array_slice($order, 0, $train_size);
+            $test_rows = array_slice($order, $train_size);
+            sort($train_rows);
+            sort($test_rows);
+            list($train_X, $test_X) = $X->partition($train_rows, $test_rows);
+            $train_y = array();
+            foreach ($train_rows as $row) {
+                $train_y[] = $y[$row];
+            }
+            $test_y = array();
+            foreach ($test_rows as $row) {
+                $test_y[] = $y[$row];
+            }
+            // A fresh model per fold, trained on the training rows only.
+            $model = new NaiveBayes();
+            $model->train($train_X, $train_y);
+            $hits = 0;
+            foreach ($test_X as $row => $x) {
+                $predicted = ($model->classify($x) >= 0.5) ? 1 : -1;
+                if ($predicted == $test_y[$row]) {
+                    $hits++;
+                }
+            }
+            $accuracy_sum += $hits / count($test_y);
+        }
+        $this->accuracy = $accuracy_sum / $num_folds;
+    }
+
+    /* FINALIZATION PHASE */
+
+    /**
+     * Trains the final classification algorithm on the full training set,
+     * using a subset of the full feature set. The final algorithm will usually
+     * be logistic regression, but can be set to Naive Bayes with the
+     * appropriate runtime option. Once finalization completes, updates the
+     * `finalized' attribute.
+     */
+    function finalize()
+    {
+        // Reduce the full feature set using the finalization options.
+        $selector = new ChiSquaredFeatureSelection(
+            $this->options['final_fs']);
+        $this->final_features = $this->full_features->restrict($selector);
+        $examples = $this->final_features->mapTrainingSet(
+            $this->docs['features']);
+        $labels = array_values($this->docs['labels']);
+        $this->final_algorithm->train($examples, $labels);
+        // Training completed; mark the classifier as ready for use.
+        $this->finalized = self::FINALIZED;
+    }
+
+    /* CLASSIFICATION PHASE */
+
+    /**
+     * Classifies a page summary using the current final classification
+     * algorithm and features, and returns the classification score. This
+     * method is also used during the labeling phase to provide a tentative
+     * label for candidates, and in this case the final algorithm is actually a
+     * reference to a Naive Bayes instance and final_features is a reference to
+     * label_features; neither of these gets saved to disk, however.
+     *
+     * @param array $page page summary array for the page to be classified
+     * @return float pseudo-probability that the page is a positive instance of
+     *  the target class
+     */
+    function classify($page)
+    {
+        if (!$this->fresh) {
+            $terms = $this->tokenizeDescription($page[self::DESCRIPTION]);
+            $vector = $this->final_features->mapDocument($terms);
+            return $this->final_algorithm->classify($vector);
+        }
+        /*
+           An untrained classifier has no evidence for either label, so it
+           reports a score a hair below the decision threshold (never
+           negative).
+         */
+        $just_below_threshold = self::THRESHOLD - 1.0E-8;
+        return max($just_below_threshold, 0.0);
+    }
+
+    /* PRIVATE INTERFACE */
+
+    /**
+     * Adds a page to the end of the candidate buffer, keeping the associated
+     * statistics up to date. During active training, each document in the
+     * buffer is tokenized, and the terms weighted by frequency; the term
+     * frequencies across documents in the buffer are tracked as well. With no
+     * active training, the buffer is simply an array of page summaries.
+     *
+     * @param array $page page summary for the document to add to the buffer
+     * @param bool $is_active whether this operation is part of active
+     *  training, in which case some extra statistics must be maintained
+     */
+    function addBufferDoc($page, $is_active = true)
+    {
+        $page['TERMS'] = $this->tokenizeDescription($page[self::DESCRIPTION]);
+        $this->buffer['docs'][] = $page;
+        if (!$is_active) {
+            // Outside active training the buffer is only a list of pages.
+            return;
+        }
+        /*
+           The per-document frequencies are built directly rather than by a
+           by-reference foreach that normalizes in place: that loop variable
+           was never unset, so the array was copied into the buffer while its
+           last element was still bound to a reference.
+         */
+        $doc_length = array_sum($page['TERMS']);
+        $doc = array();
+        foreach ($page['TERMS'] as $term => $count) {
+            // Term frequency relative to the document's token count; every
+            // count is at least 1, so $doc_length is positive here.
+            $doc[$term] = $count / $doc_length;
+            if (!isset($this->buffer['stats']['terms'][$term])) {
+                $this->buffer['stats']['terms'][$term] = $count;
+            } else {
+                $this->buffer['stats']['terms'][$term] += $count;
+            }
+            $this->buffer['stats']['num_tokens'] += $count;
+        }
+        $this->buffer['stats']['docs'][] = $doc;
+        $this->buffer['stats']['num_docs']++;
+    }
+
+    /**
+     * Removes the document at the front of the candidate buffer. During active
+     * training the cross-document statistics for terms occurring in the
+     * document being removed are maintained.
+     *
+     * @param bool $is_active whether this operation is part of active
+     *  training, in which case some extra statistics must be maintained
+     */
+    function dropBufferDoc($is_active = true)
+    {
+        /*
+           Nothing to drop from an empty buffer. Without this guard
+           array_shift returns NULL, the loop below iterates over a
+           non-array, and num_docs is decremented below zero.
+         */
+        if (empty($this->buffer['docs'])) {
+            return;
+        }
+        $page = array_shift($this->buffer['docs']);
+        if ($is_active) {
+            // Take the removed document's terms out of the pool statistics.
+            foreach ($page['TERMS'] as $term => $count) {
+                $this->buffer['stats']['terms'][$term] -= $count;
+                $this->buffer['stats']['num_tokens'] -= $count;
+            }
+            // The per-document statistics are kept parallel to the docs.
+            array_shift($this->buffer['stats']['docs']);
+            $this->buffer['stats']['num_docs']--;
+        }
+    }
+
+    /**
+     * Moves a document in the candidate buffer up to the front, in preparation
+     * for a label request. The document is specified by its index in the
+     * buffer.
+     *
+     * @param int $i document index within the candidate buffer
+     */
+    function moveBufferDocToFront($i)
+    {
+        // Move the page summary itself ...
+        $removed = array_splice($this->buffer['docs'], $i, 1);
+        array_unshift($this->buffer['docs'], $removed[0]);
+        // ... and the matching entry in the per-document statistics.
+        $removed = array_splice($this->buffer['stats']['docs'], $i, 1);
+        array_unshift($this->buffer['stats']['docs'], $removed[0]);
+    }
+
+    /**
+     * Tokenizes a string into a map from terms to within-string frequencies.
+     *
+     * @param string $description string to tokenize
+     * @return array associative array mapping terms to their within-string
+     *  frequencies
+     */
+    function tokenizeDescription($description)
+    {
+        /*
+           For now, adopt a very simple whitespace tokenizing strategy,
+           because the language-aware alternative
+           (PhraseParser::extractPhrasesInLists) is very slow.
+
+           PREG_SPLIT_NO_EMPTY drops the empty strings produced by leading or
+           trailing whitespace. A truthiness test must not be used for that,
+           because it would also discard the legitimate token "0".
+         */
+        $tokens = preg_split('/\s+/', $description, -1, PREG_SPLIT_NO_EMPTY);
+        if ($tokens === false) {
+            // The split failed: report no terms.
+            return array();
+        }
+        // Maps each distinct token to its number of occurrences, in order of
+        // first appearance.
+        return array_count_values($tokens);
+    }
+
+    /**
+     * Loads class attributes from compressed, serialized files on disk, and
+     * stores their names so that they will be saved back to disk later. Each
+     * property (if it has been previously set) is stored in its own file under
+     * the classifier's data directory, named after the property. The file
+     * holds the output of gzcompress (zlib format, not gzip format), so it
+     * can't actually be decompressed by the standard gzip utility. If a file
+     * doesn't exist, then the instance property is left untouched. The
+     * property names are passed as a variable number of arguments.
+     *
+     * @param string $property_name,... variably-sized list of property names
+     *  to try to load data for
+     */
+    function loadProperties(/* args... */)
+    {
+        $properties = func_get_args();
+        foreach ($properties as $property_name) {
+            $filename = WORK_DIRECTORY."/classifiers/".$this->class_label.
+                "/".$property_name.".txt";
+            /*
+               Only overwrite the property when there is stored data for it;
+               a missing file must not clobber a value already in memory.
+             */
+            if (file_exists($filename)) {
+                $serialized_data = gzuncompress(file_get_contents($filename));
+                $this->$property_name = unserialize($serialized_data);
+            }
+        }
+        // Remember every requested name so storeLoadedProperties saves it.
+        $this->loaded_properties = $properties;
+    }
+
+    /**
+     * Writes every property named in the loaded_properties instance attribute
+     * back to disk. Each property value is serialized, compressed with
+     * gzcompress, and saved to its own file named after the property under
+     * the classifier's directory; the file is then made world-writable.
+     */
+    function storeLoadedProperties()
+    {
+        $base_dir = WORK_DIRECTORY."/classifiers/".$this->class_label;
+        foreach ($this->loaded_properties as $name) {
+            $path = $base_dir."/".$name.".txt";
+            $compressed = gzcompress(serialize($this->$name));
+            file_put_contents($path, $compressed);
+            chmod($path, 0777);
+        }
+    }
+
+    /* PUBLIC STATIC INTERFACE */
+
+    /**
+     * Given a page summary (passed by reference) and a list of classifiers,
+     * augments the summary meta words with the class label of each classifier
+     * that scores the summary at or above self::THRESHOLD. This static method
+     * is used by fetchers to classify downloaded pages. In addition to the
+     * class label, the pseudo-probability that the document belongs to the
+     * class is recorded as well. This is recorded both as the score (as a
+     * percentage) rounded down to the nearest multiple of ten, and as
+     * "<n>plus" for each multiple of ten, n, less than or equal to the
+     * rounded score and greater than or equal to the threshold.
+     *
+     * As an example, suppose that a classifier with class label `label' has
+     * determined that a document is a positive example with pseudo-probability
+     * 0.87 and threshold 0.5. The following meta words are added to the
+     * summary: class:label, class:label:80, class:label:80plus,
+     * class:label:70plus, class:label:60plus, and class:label:50plus.
+     *
+     * @param array $summary page summary to classify, passed by reference
+     * @param array $classifiers list of Classifier instances, each prepared
+     *  for classifying (via the prepareToClassify method)
+     */
+    static function labelPage(&$summary, $classifiers)
+    {
+        foreach ($classifiers as $classifier) {
+            $score = $classifier->classify($summary);
+            if ($score >= self::THRESHOLD) {
+                if (!isset($summary[self::META_WORDS])) {
+                    $summary[self::META_WORDS] = array();
+                }
+                // Percentage rounded down to a multiple of ten: 0.87 => 80.
+                $score = intval(floor(($score * 100) / 10) * 10);
+                /*
+                   The exact-score meta word uses the same rounded percentage
+                   as the first "plus" meta word (e.g. 80 and 80plus).
+                 */
+                $label_score = sprintf("%d", $score);
+                $label = $classifier->class_label;
+                $summary[self::META_WORDS][] = "class:{$label}";
+                $summary[self::META_WORDS][] = "class:{$label}:{$label_score}";
+                $min_score = intval(self::THRESHOLD * 100);
+                for ($s = $score; $s >= $min_score; $s -= 10) {
+                    $summary[self::META_WORDS][] = "class:{$label}:{$s}plus";
+                }
+            }
+        }
+    }
+
+    /**
+     * Returns an array of classifier instances currently stored in the
+     * classifiers directory. The array maps class labels to their
+     * corresponding classifiers, and each classifier is a minimal instance,
+     * containing only summary statistics. Entries of the classifiers
+     * directory that do not contain a readable, unserializable
+     * `classifier.txt' are skipped.
+     *
+     * @return array associative array of class labels mapped to their
+     *  corresponding classifier instances
+     */
+    static function getClassifierList()
+    {
+        $classifiers = array();
+        $classifier_dirs = glob(WORK_DIRECTORY."/classifiers/*");
+        // glob returns false on error; treat that like an empty directory.
+        if (!$classifier_dirs) {
+            return $classifiers;
+        }
+        foreach ($classifier_dirs as $classifier_dir) {
+            $classifier_file = $classifier_dir."/classifier.txt";
+            if (!is_file($classifier_file)) {
+                continue;
+            }
+            $serialized_data = file_get_contents($classifier_file);
+            if ($serialized_data === false) {
+                continue;
+            }
+            $classifier = unserialize($serialized_data);
+            // unserialize yields false for corrupt data; skip such entries.
+            if (!is_object($classifier)) {
+                continue;
+            }
+            $classifiers[$classifier->class_label] = $classifier;
+        }
+        return $classifiers;
+    }
+
+    /**
+     * Looks up the minimal classifier instance stored on disk for a class
+     * label by unserializing the label directory's `classifier.txt' file.
+     *
+     * @param string $label classifier's class label
+     * @return object classifier instance with the relevant class label, or
+     *  NULL if no such classifier exists on disk
+     */
+    static function getClassifier($label)
+    {
+        $path = WORK_DIRECTORY."/classifiers/{$label}/classifier.txt";
+        if (!file_exists($path)) {
+            return NULL;
+        }
+        return unserialize(file_get_contents($path));
+    }
+
+    /**
+     * Collects, for each class label in a list, the raw file data needed to
+     * initialize a classifier for that label. This static method is used to
+     * prepare a collection of classifiers for distribution to fetchers, so
+     * that each fetcher can classify pages as it downloads them. Besides the
+     * base classifier file, only the final features and final algorithm are
+     * passed along, both necessary for classifying new documents. If any of
+     * the three files is missing for a label, that label maps to false
+     * instead of an array.
+     *
+     * @param array $labels flat array of class labels for which to load data
+     * @return array associative array mapping class labels to arrays of
+     *  web-encoded file contents keyed by field name, or to false when a
+     *  required file is missing
+     */
+    static function loadClassifiersData($labels)
+    {
+        $fields = array('classifier', 'final_features', 'final_algorithm');
+        $data_by_label = array();
+        foreach ($labels as $label) {
+            $dir = WORK_DIRECTORY."/classifiers/{$label}";
+            $encoded = array();
+            foreach ($fields as $field) {
+                $path = "{$dir}/{$field}.txt";
+                if (!file_exists($path)) {
+                    // An incomplete classifier is reported as false.
+                    $encoded = false;
+                    break;
+                }
+                /*
+                   Contents are web-encoded because they travel in an HTTP
+                   response to each fetcher as it prepares for a new crawl.
+                 */
+                $encoded[$field] = webencode(file_get_contents($path));
+            }
+            $data_by_label[$label] = $encoded;
+        }
+        return $data_by_label;
+    }
+
+    /**
+     * The dual of loadClassifiersData, this static method rebuilds a
+     * Classifier instance from an array of web-encoded data. This gets called
+     * by each fetcher, using the data that it receives from the name server
+     * when establishing a new crawl. The 'classifier' entry is web-decoded
+     * and unserialized to obtain the instance; every other entry is
+     * web-decoded, uncompressed, unserialized, and assigned to the property
+     * of the same name.
+     *
+     * NOTE(review): the data is passed to unserialize, so it should only
+     * ever come from a trusted name server -- confirm against the caller.
+     *
+     * @param array $data associative array mapping property names to their
+     *  serialized and compressed data
+     * @return object Classifier instance built from the passed-in data, or
+     *  NULL if $data has no 'classifier' entry
+     */
+    static function newClassifierFromData($data)
+    {
+        if (!isset($data['classifier'])) {
+            return NULL;
+        }
+        $instance = unserialize(webdecode($data['classifier']));
+        // What remains after this are the separately stored properties.
+        unset($data['classifier']);
+        foreach ($data as $property => $encoded) {
+            $compressed = webdecode($encoded);
+            $instance->$property = unserialize(gzuncompress($compressed));
+        }
+        $instance->loaded_properties = array_keys($data);
+        return $instance;
+    }
+
+    /**
+     * Saves a classifier instance to disk. The classifier's directory is
+     * created if necessary, the properties listed in its loaded_properties
+     * attribute are written to their own compressed files (via
+     * storeLoadedProperties), and then the classifier object itself is
+     * serialized, uncompressed, to a file called `classifier.txt'. The
+     * directory (when newly created) and `classifier.txt' are made
+     * world-writable so that they can be manipulated without hassle from the
+     * command line.
+     *
+     * @param object $classifier Classifier instance to store to disk
+     */
+    static function setClassifier($classifier)
+    {
+        $label = $classifier->class_label;
+        $dirname = WORK_DIRECTORY."/classifiers/".$label;
+        if (!file_exists($dirname)) {
+            mkdir($dirname);
+            chmod($dirname, 0777);
+        }
+        $classifier->storeLoadedProperties();
+        $path = $dirname."/classifier.txt";
+        file_put_contents($path, serialize($classifier));
+        chmod($path, 0777);
+    }
+
+    /**
+     * Removes the directory belonging to a class label, together with
+     * everything inside it, using the unlinkRecursive method of the
+     * configured database manager class. When no directory exists for the
+     * label, nothing happens.
+     *
+     * @param string $label class label of the classifier to be deleted
+     */
+    static function deleteClassifier($label)
+    {
+        $dirname = WORK_DIRECTORY."/classifiers/{$label}";
+        if (!file_exists($dirname)) {
+            return;
+        }
+        // The manager class name is derived from the DBMS constant.
+        $manager_class = ucfirst(DBMS)."Manager";
+        $manager = new $manager_class();
+        $manager->unlinkRecursive($dirname);
+    }
+
+    /**
+     * Strips every character that is not an ASCII letter, a digit, or an
+     * underscore from a label, so that it may be easily saved to disk and
+     * used in queries as a meta word.
+     *
+     * @param string $label class label to clean
+     * @return string the label with all disallowed characters removed
+     */
+    static function cleanLabel($label)
+    {
+        $cleaned = preg_replace('/[^a-zA-Z0-9_]/', '', $label);
+        return $cleaned;
+    }
+
+    /**
+     * Builds the name of the crawl mix associated with a class label by
+     * prefixing the label with `CLASSIFY_'.
+     *
+     * @param string $label class label associated with the crawl mix
+     * @return string name that can be used for the crawl mix associated with
+     *  $label
+     */
+    static function getCrawlMixName($label)
+    {
+        $prefix = 'CLASSIFY_';
+        return $prefix.$label;
+    }
+
+    /**
+     * Returns a key that can be used internally to refer to a particular
+     * page summary. The key is the MD5 hash of the summary's URL field.
+     *
+     * @param array $page page summary to return a key for
+     * @return string key identifying the page summary by its URL
+     */
+    static function makeKey($page)
+    {
+        $url = $page[self::URL];
+        return md5($url);
+    }
+
+    /* PRIVATE STATIC INTERFACE */
+
+    /**
+     * Calculates the KL-divergence to the mean for a collection of discrete
+     * two-element probability distributions. Each distribution is specified by
+     * a single probability, p, since the second probability is just 1 - p. The
+     * KL-divergence to the mean is used as a measure of disagreement between
+     * members of a committee of classifiers, where each member assigns a
+     * classification score to the same document.
+     *
+     * Every probability, and the mean itself, is clamped to the range
+     * [1.0E-8, 1 - 1.0E-8] so that the logarithms are always defined.
+     *
+     * @param array $ps probabilities describing several discrete two-element
+     *  probability distributions
+     * @return float KL-divergence to the mean for the collection of
+     *  distributions; 0.0 when $ps is empty
+     */
+    static function klDivergenceToMean($ps)
+    {
+        $k = count($ps);
+        // With no distributions there is no disagreement (and no mean).
+        if ($k == 0) {
+            return 0.0;
+        }
+        $epsilon = 1.0E-8;
+        $mean = array_sum($ps) / $k;
+        $mean = max(min($mean, 1.0 - $epsilon), $epsilon);
+        $kld = 0.0;
+        foreach ($ps as $p) {
+            $p = max(min($p, 1.0 - $epsilon), $epsilon);
+            $kld += $p * log($p / $mean);
+            $kld += (1 - $p) * log((1 - $p) / (1 - $mean));
+        }
+        return $kld / $k;
+    }
+}
+?>
\ No newline at end of file
diff --git a/lib/classifiers/classifier_algorithm.php b/lib/classifiers/classifier_algorithm.php
new file mode 100644
index 000000000..c73b49c1e
--- /dev/null
+++ b/lib/classifiers/classifier_algorithm.php
@@ -0,0 +1,59 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage classifier
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+// Stop immediately, printing "BAD REQUEST", unless BASE_DIR has been defined.
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Abstract base class for classification algorithms that implement a common
+ * interface.
+ *
+ * It currently supplies a single administrative utility, conditional
+ * logging, that all classification algorithms can take advantage of.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage classifier
+ */
+abstract class ClassifierAlgorithm
+{
+    // TODO: Add in automatic parameter setting, and better logging facilities,
+    // similar to those used by classifier_tool.
+
+    /**
+     * Debug level; messages passed to log are only written when this is
+     * greater than zero.
+     * @var int
+     */
+    var $debug = 0;
+
+    /**
+     * Sends a message to crawlLog, but only if debugging is switched on.
+     *
+     * @param string $message message to log
+     */
+    function log($message)
+    {
+        if (!($this->debug > 0)) {
+            return;
+        }
+        crawlLog($message);
+    }
+}
diff --git a/lib/classifiers/feature_selection.php b/lib/classifiers/feature_selection.php
new file mode 100644
index 000000000..c5c236616
--- /dev/null
+++ b/lib/classifiers/feature_selection.php
@@ -0,0 +1,176 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage classifier
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+// Stop immediately, printing "BAD REQUEST", unless BASE_DIR has been defined.
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * Abstract base class specifying the interface for choosing the top features
+ * of a dataset.
+ *
+ * Each FeatureSelection subclass implements a select method that takes a
+ * Features instance and returns a mapping from a subset of the old feature
+ * indices to new ones.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage classifier
+ */
+abstract class FeatureSelection
+{
+    /**
+     * Copies any passed runtime parameters onto the instance, one property
+     * per array key.
+     *
+     * @param array $parameters optional associative array of parameters to
+     *  replace the default ones with
+     */
+    function __construct($parameters = array())
+    {
+        foreach ($parameters as $name => $setting) {
+            $this->$name = $setting;
+        }
+    }
+
+    /**
+     * Constructs a map from old feature indices to new ones by draining a
+     * heap of selected features. New indices are handed out consecutively,
+     * starting at 1, in the order that entries come off the heap. Feature
+     * index 0, which is used as an intercept term, is always kept and always
+     * maps to 0.
+     *
+     * @param object $selected heap whose entries are two-element arrays of
+     *  an informativeness score followed by a feature index; it is emptied
+     *  by this call
+     * @return array associative array mapping a subset of the original feature
+     *  indices to the new indices
+     */
+    function buildMap($selected)
+    {
+        $index_map = array(0 => 0);
+        for ($new_index = 1; !$selected->isEmpty(); $new_index++) {
+            $entry = $selected->extract();
+            // $entry is (score, old feature index); only the index is needed.
+            $index_map[$entry[1]] = $new_index;
+        }
+        return $index_map;
+    }
+
+    /**
+     * Computes the top features of a Features instance, and returns a mapping
+     * from a subset of those features to new contiguous indices. The mapping
+     * allows documents that have already been mapped into the larger feature
+     * space to be converted to the smaller feature space, while keeping the
+     * feature indices contiguous (e.g., 1, 2, 3, 4, ... instead of 22, 35, 75,
+     * ...).
+     *
+     * @param object $features Features instance
+     * @return array associative array mapping a subset of the original feature
+     *  indices to new indices
+     */
+    abstract function select(Features $features);
+}
+
+
+/**
+ * A subclass of FeatureSelection that implements chi-squared feature
+ * selection.
+ *
+ * This feature selection method scores each feature according to its
+ * informativeness, then selects the top N most informative features, where N
+ * is a run-time parameter.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage classifier
+ */
+class ChiSquaredFeatureSelection extends FeatureSelection
+{
+    /**
+     * The maximum number of features to select, a runtime parameter. When
+     * unset, every feature is selected.
+     * @var int
+     */
+    var $max;
+
+    /**
+     * Uses the chi-squared feature selection algorithm to rank features by
+     * informativeness, and return a map from old feature indices to new ones.
+     * At most $this->max features (all of them if max is unset) are kept, in
+     * addition to the intercept at index 0. If max is zero or negative, only
+     * the intercept is kept.
+     *
+     * @param object $features full feature set
+     * @return array associative array mapping a subset of the original feature
+     *  indices to new indices
+     */
+    function select(Features $features)
+    {
+        $n = $features->numFeatures();
+        /*
+           A min-heap keeps the weakest selected feature on top, so it can be
+           evicted cheaply when a more informative feature turns up.
+         */
+        $selected = new SplMinHeap();
+        $allowed = isset($this->max) ? min($this->max, $n) : $n;
+        $labels = array(-1, 1);
+
+        /*
+           Start with 1, since 0 is dedicated to the constant intercept term;
+           <= $n because n is the last feature.
+         */
+        for ($j = 1; $j <= $n; $j++) {
+            $max_chi2 = 0.0;
+            foreach ($labels as $label) {
+                /*
+                   t = term present
+                   l = document has label
+                   n = negation
+                 */
+                $stats = $features->varStats($j, $label);
+                list($t_l, $t_nl, $nt_l, $nt_nl) = $stats;
+                $num = ($t_l * $nt_nl) - ($t_nl * $nt_l);
+                $den = ($t_l + $t_nl) * ($nt_l + $nt_nl);
+                /*
+                   NOTE(review): a zero denominator means the term occurs in
+                   every example or in none; scoring that as INF ranks such a
+                   term first -- confirm that this is intended.
+                 */
+                $chi2 = $den != 0 ? ($num * $num) / $den : INF;
+                if ($chi2 > $max_chi2) {
+                    $max_chi2 = $chi2;
+                }
+            }
+
+            /*
+               Keep track of top features in a heap, as we compute
+               informativeness.
+             */
+            if ($allowed > 0) {
+                $selected->insert(array($max_chi2, $j));
+                $allowed -= 1;
+            } else if (!$selected->isEmpty()) {
+                /*
+                   The heap is only empty here when no features are allowed at
+                   all (max <= 0); top() would throw in that case.
+                 */
+                list($other_chi2, $_) = $selected->top();
+                if ($max_chi2 > $other_chi2) {
+                    $selected->extract();
+                    $selected->insert(array($max_chi2, $j));
+                }
+            }
+        }
+
+        return $this->buildMap($selected);
+    }
+}
+?>
\ No newline at end of file
diff --git a/lib/classifiers/features.php b/lib/classifiers/features.php
new file mode 100644
index 000000000..5aa3aa392
--- /dev/null
+++ b/lib/classifiers/features.php
@@ -0,0 +1,571 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage classifier
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+/**
+ * Manages a dataset's features, providing a standard interface for converting
+ * documents to feature vectors, and for accessing feature statistics.
+ *
+ * Each document in the training set is expected to be fed through an instance
+ * of a subclass of this abstract class in order to convert it to a feature
+ * vector. Terms are replaced with feature indices (e.g., 'Pythagorean' => 1,
+ * 'theorem' => 2, and so on), which are contiguous. The value at a feature
+ * index is determined by the subclass; one might weight terms according to how
+ * often they occur in the document, while another might use a simple binary
+ * representation. The feature index 0 is reserved for an intercept term, which
+ * always has a value of one.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage classifier
+ */
+abstract class Features
+{
+    /**
+     * Maps terms to their feature indices, which start at 1.
+     * @var array
+     */
+    var $vocab = array();
+
+    /**
+     * Maps feature indices to the number of examples, per label, in which
+     * the corresponding term occurs.
+     * @var array
+     */
+    var $var_freqs = array();
+
+    /**
+     * Maps labels to the number of documents they're assigned to.
+     * @var array
+     */
+    var $label_freqs = array(-1 => 0, 1 => 0);
+
+    /**
+     * Maps old feature indices to new ones when a feature subset operation has
+     * been applied to restrict the number of features.
+     * @var array
+     */
+    var $feature_map;
+
+    /**
+     * A list of the top terms according to the last feature subset operation,
+     * if any.
+     * @var array
+     */
+    var $top_terms = array();
+
+    /**
+     * Maps a new example to a feature vector, adding any new terms to the
+     * vocabulary, and updating term and label statistics. The example should
+     * be an array of terms and their counts, and the output simply replaces
+     * terms with feature indices (plus the intercept at index 0), sorted by
+     * feature index.
+     *
+     * @param array $terms array of terms mapped to the number of times they
+     *  occur in the example
+     * @param int $label label for this example, either -1 or 1
+     * @return array input example with terms replaced by feature indices
+     */
+    function addExample($terms, $label)
+    {
+        $this->label_freqs[$label]++;
+        $features = array();
+        foreach ($terms as $term => $count) {
+            if (isset($this->vocab[$term])) {
+                $j = $this->vocab[$term];
+            } else {
+                // Var indices start at 1 to accommodate the intercept at 0.
+                $j = count($this->vocab) + 1;
+                $this->vocab[$term] = $j;
+            }
+            $features[$j] = $count;
+            // Update term statistics
+            if (!isset($this->var_freqs[$j][$label])) {
+                $this->var_freqs[$j][$label] = 1;
+            } else {
+                $this->var_freqs[$j][$label]++;
+            }
+        }
+        // Feature 0 is an intercept term
+        $features[0] = 1;
+        ksort($features);
+        return $features;
+    }
+
+    /**
+     * Updates the label and term statistics to reflect a label change for an
+     * example from the training set. A new label of 0 indicates that the
+     * example is being removed entirely. Note that term statistics only count
+     * one occurrence of a term per example.
+     *
+     * @param array $features feature vector from when the example was
+     *  originally added
+     * @param int $old_label old example label in {-1, 1}
+     * @param int $new_label new example label in {-1, 0, 1}, where 0 indicates
+     *  that the example should be removed entirely
+     */
+    function updateExampleLabel($features, $old_label, $new_label)
+    {
+        $this->label_freqs[$old_label]--;
+        if ($new_label != 0) {
+            $this->label_freqs[$new_label]++;
+        }
+        // Remove the intercept term first.
+        unset($features[0]);
+        foreach (array_keys($features) as $j) {
+            $this->var_freqs[$j][$old_label]--;
+            if ($new_label != 0) {
+                /*
+                   The term may never have been seen with the new label, so
+                   initialize the counter exactly as addExample does.
+                 */
+                if (!isset($this->var_freqs[$j][$new_label])) {
+                    $this->var_freqs[$j][$new_label] = 1;
+                } else {
+                    $this->var_freqs[$j][$new_label]++;
+                }
+            }
+        }
+    }
+
+    /**
+     * Returns the number of features, not including the intercept term
+     * represented by feature zero. For example, if we had features 0..10,
+     * this function would return 10.
+     *
+     * @return int the number of features in the training set
+     */
+    function numFeatures()
+    {
+        return count($this->vocab);
+    }
+
+    /**
+     * Returns the positive and negative label counts for the training set.
+     *
+     * @return array two-element flat array holding the count of examples
+     *  labeled 1 followed by the count of examples labeled -1
+     */
+    function labelStats()
+    {
+        return array($this->label_freqs[1], $this->label_freqs[-1]);
+    }
+
+    /**
+     * Returns the statistics for a particular feature and label in the
+     * training set. The statistics are counts of how often the term appears or
+     * fails to appear in examples with or without the target label. They are
+     * returned in a flat array, in the following order:
+     *
+     *     0 => # examples where feature present, label matches
+     *     1 => # examples where feature present, label doesn't match
+     *     2 => # examples where feature absent, label matches
+     *     3 => # examples where feature absent, label doesn't match
+     *
+     * @param int $j feature index
+     * @param int $label target label
+     * @return array feature statistics in 4-element flat array
+     */
+    function varStats($j, $label)
+    {
+        $tl = isset($this->var_freqs[$j][$label]) ?
+            $this->var_freqs[$j][$label] : 0;
+        $t  = array_sum($this->var_freqs[$j]);
+        $l  = $this->label_freqs[$label];
+        $N  = array_sum($this->label_freqs);
+        return array(
+            $tl,               //  t and  l
+            $t - $tl,          //  t and ~l
+            $l - $tl,          // ~t and  l
+            $N - $t - $l + $tl // ~t and ~l
+        );
+    }
+
+    /**
+     * Given a FeatureSelection instance, return a new clone of this Features
+     * instance using a restricted feature subset. The new Features instance
+     * is augmented with a feature map that it can use to convert feature
+     * indices from the larger feature set to indices for the reduced set, and
+     * with a list of up to five top terms.
+     *
+     * @param object $fs FeatureSelection instance to be used to select the
+     *  most informative terms
+     * @return object new Features instance using the restricted feature set
+     */
+    function restrict(FeatureSelection $fs)
+    {
+        $feature_map = $fs->select($this);
+        /*
+           Collect up to five features from the front of the feature map,
+           skipping the very first entry, which is the intercept term.
+           NOTE(review): this treats the map's insertion order as "most
+           informative first"; confirm that against the order in which the
+           FeatureSelection implementation builds its map.
+         */
+        $top_features = array();
+        foreach (array_slice(array_keys($feature_map), 1, 5) as $j) {
+            $top_features[$j] = true;
+        }
+        $classname = get_class($this);
+        $new_features = new $classname;
+        foreach ($this->vocab as $term => $old_j) {
+            if (isset($feature_map[$old_j])) {
+                $new_j = $feature_map[$old_j];
+                $new_features->vocab[$term] = $new_j;
+                $new_features->var_freqs[$new_j] = $this->var_freqs[$old_j];
+                // Get the actual term associated with a top feature.
+                if (isset($top_features[$old_j])) {
+                    $top_features[$old_j] = $term;
+                }
+            }
+        }
+        $new_features->label_freqs = $this->label_freqs;
+        $new_features->feature_map = $feature_map;
+        // Note that this preserves the order of top features.
+        $new_features->top_terms = array_values($top_features);
+        return $new_features;
+    }
+
+    /**
+     * Maps the indices of a feature vector to those used by a restricted
+     * feature set, dropping any features that aren't in the map. If this
+     * Features instance isn't restricted, then the passed-in features are
+     * returned unmodified.
+     *
+     * @param array $features feature vector mapping feature indices to
+     *  frequencies
+     * @return array original feature vector with indices mapped
+     *  according to the feature_map property, and any features that don't
+     *  occur in feature_map dropped
+     */
+    function mapToRestrictedFeatures($features)
+    {
+        if (empty($this->feature_map)) {
+            return $features;
+        }
+        $mapped_features = array();
+        foreach ($features as $j => $count) {
+            if (isset($this->feature_map[$j])) {
+                $mapped_features[$this->feature_map[$j]] = $count;
+            }
+        }
+        return $mapped_features;
+    }
+
+    /**
+     * Given an array of feature vectors mapping feature indices to counts,
+     * returns a sparse matrix representing the dataset transformed according
+     * to the specific Features subclass. A Features subclass might use simple
+     * binary features, but it might also use some form of TF * IDF, which
+     * requires the full dataset in order to assign weights to particular
+     * document features; thus the necessity of a map over the entire training
+     * set prior to its input to a classification algorithm.
+     *
+     * @param array $docs array of training examples represented as feature
+     *  vectors where the values are per-example counts
+     * @return object SparseMatrix instance whose rows are the transformed
+     *  feature vectors
+     */
+    abstract function mapTrainingSet($docs);
+
+    /**
+     * Maps a vector of terms mapped to their counts within a single document
+     * to a transformed feature vector, exactly like a row in the sparse matrix
+     * returned by mapTrainingSet. This method is used to transform a tokenized
+     * document prior to classification.
+     *
+     * @param array $tokens associative array of terms mapped to their
+     *  within-document counts
+     * @return array feature vector corresponding to the tokens, mapped
+     *  according to the implementation of a particular Features subclass
+     */
+    abstract function mapDocument($tokens);
+}
+
+
+/**
+ * A concrete Features subclass that represents a document as a binary
+ * vector where a one indicates that a feature is present in the document, and
+ * a zero indicates that it is not. The absent features are ignored, so the
+ * binary vector is actually sparse, containing only those feature indices
+ * where the value is one.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage classifier
+ */
+class BinaryFeatures extends Features
+{
+    /**
+     * Replaces term counts with 1, indicating only that a feature occurs in a
+     * document.  When a Features instance is a subset of a larger instance, it
+     * will have a feature_map member that maps feature indices from the larger
+     * feature set to the smaller one. The indices must be mapped in this way
+     * so that the training set can retain complete information, only throwing
+     * away features just before training. See the abstract parent class for a
+     * more thorough introduction to the interface.
+     *
+     * @param array $docs array of training examples represented as feature
+     *  vectors where the values are per-example counts
+     * @return object SparseMatrix instance whose rows are the transformed
+     *  feature vectors
+     */
+    function mapTrainingSet($docs)
+    {
+        $m = count($docs);
+        $n = count($this->vocab) + 1;
+        $X = new SparseMatrix($m, $n);
+
+        $i = 0;
+        foreach ($docs as $features) {
+            /*
+               If this is a restricted feature set, map from the expanded
+               feature set first, potentially dropping features.
+             */
+            $features = $this->mapToRestrictedFeatures($features);
+            // array_fill_keys copes with an example that has no features;
+            // array_fill(0, 0, 1) is rejected by PHP versions before 5.6.
+            $new_features = array_fill_keys(array_keys($features), 1);
+            $X->setRow($i++, $new_features);
+        }
+
+        return $X;
+    }
+
+    /**
+     * Converts a map from terms to within-document term counts into the
+     * corresponding sparse binary feature vector used for classification.
+     *
+     * @param array $tokens associative array of terms mapped to their
+     *  within-document counts
+     * @return array feature vector corresponding to the tokens, mapped
+     *  according to the implementation of a particular Features subclass
+     */
+    function mapDocument($tokens)
+    {
+        $x = array();
+        foreach ($tokens as $token => $count) {
+            if (isset($this->vocab[$token])) {
+                $x[$this->vocab[$token]] = 1;
+            }
+        }
+        $x[0] = 1;
+        ksort($x);
+        return $x;
+    }
+}
+
+
+/**
+ * A concrete Features subclass that represents a document as a
+ * vector of feature weights, where weights are computed using a modified form
+ * of TF * IDF. This feature mapping is experimental, and may not work
+ * correctly.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage classifier
+ */
+class WeightedFeatures extends Features
+{
+    /** @var int number of examples seen by the last mapTrainingSet call */
+    var $D = 0;
+    /** @var array feature index => number of examples containing it */
+    var $n = array();
+
+    /**
+     * Builds the TF * IDF weighted matrix for the stored examples.
+     *
+     * NOTE(review): reads $this->examples and $this->exampleLabels rather
+     * than $docs and returns a pair, unlike BinaryFeatures; confirm callers.
+     *
+     * @param array $docs currently unused
+     * @return array two-element array: the SparseMatrix and the labels
+     */
+    function mapTrainingSet($docs)
+    {
+        $m = count($this->examples);
+        $n = count($this->vocab);
+        $this->D = $m;
+        $this->n = array();
+        // Fill in $n, the count of documents that contain each term
+        foreach ($this->examples as $features) {
+            foreach (array_keys($features) as $j) {
+                $this->n[$j] = isset($this->n[$j]) ? $this->n[$j] + 1 : 1;
+            }
+        }
+        $X = new SparseMatrix($m, $n);
+        foreach ($this->examples as $i => $features) {
+            $X->setRow($i, $this->tfIdfWeights($features));
+        }
+        return array($X, $this->exampleLabels);
+    }
+
+    /**
+     * Weights the pending document held in $this->current, then clears it.
+     * @param array $tokens currently unused (NOTE(review): confirm intent)
+     * @return array feature indices mapped to normalized TF * IDF weights
+     */
+    function mapDocument($tokens)
+    {
+        ksort($this->current);
+        $x = $this->tfIdfWeights($this->current);
+        $this->current = array();
+        return $x;
+    }
+
+    /**
+     * Computes unit-length TF * IDF weights for one feature vector. Features
+     * never seen in training count as occurring in zero documents, and an
+     * all-zero vector is returned as is rather than divided by a zero norm.
+     *
+     * @param array $features feature indices mapped to counts
+     * @return array the same indices mapped to normalized weights
+     */
+    function tfIdfWeights($features)
+    {
+        $weights = array();
+        $sum = 0;
+        foreach ($features as $j => $count) {
+            $n_j = isset($this->n[$j]) ? $this->n[$j] : 0;
+            $tf = 1 + log($count);
+            $idf = log(($this->D + 1) / ($n_j + 1));
+            $weights[$j] = $tf * $idf;
+            $sum += $weights[$j] * $weights[$j];
+        }
+        $norm = sqrt($sum);
+        if ($norm > 0) {
+            foreach ($weights as $j => $weight) {
+                $weights[$j] = $weight / $norm;
+            }
+        }
+        return $weights;
+    }
+}
+
+
+/**
+ * A sparse matrix implementation based on an associative array of associative
+ * arrays.
+ *
+ * A SparseMatrix is mostly a wrapper around an array of arrays, but it keeps
+ * track of some extra information such as the true matrix dimensions, and the
+ * number of non-zero entries. It also provides a convenience method for
+ * partitioning the matrix rows into two new sparse matrices.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage classifier
+ */
+class SparseMatrix implements Iterator
+{
+    /** @var int number of rows, regardless of whether some are empty */
+    var $m;
+
+    /** @var int number of columns, regardless of whether some are empty */
+    var $n;
+
+    /**
+     * The number of non-zero entries.
+     * @var int
+     */
+    var $nonzero = 0;
+
+    /**
+     * The actual matrix data, an associative array mapping row indices to
+     * associative arrays mapping column indices to their values.
+     * @var array
+     */
+    var $data;
+
+    /**
+     * Initializes a new sparse matrix with specific dimensions.
+     *
+     * @param int $m number of rows
+     * @param int $n number of columns
+     */
+    function __construct($m, $n)
+    {
+        $this->m = $m;
+        $this->n = $n;
+        $this->data = array();
+    }
+
+    /* Accessors for the dimensions and the non-zero entry count */
+    function rows()    { return $this->m; }
+    function columns() { return $this->n; }
+    function nonzero() { return $this->nonzero; }
+
+    /**
+     * Sets a particular row of data, keeping the count of non-zero entries
+     * accurate even when the row replaces one that was set earlier.
+     *
+     * @param int $i row index
+     * @param array $row associative array mapping column indices to values
+     */
+    function setRow($i, $row)
+    {
+        if (isset($this->data[$i])) {
+            // Forget the entries of the row that is being replaced.
+            $this->nonzero -= count($this->data[$i]);
+        }
+        $this->data[$i] = $row;
+        $this->nonzero += count($row);
+    }
+
+    /**
+     * Given two sets of row indices, returns two new sparse matrices
+     * consisting of the corresponding rows. A row index that was never set
+     * contributes an empty row.
+     *
+     * @param array $a_indices row indices for first new sparse matrix
+     * @param array $b_indices row indices for second new sparse matrix
+     * @return array array with two entries corresponding to the first and
+     *  second new matrices
+     */
+    function partition($a_indices, $b_indices)
+    {
+        $parts = array();
+        foreach (array($a_indices, $b_indices) as $indices) {
+            $part = new SparseMatrix(count($indices), $this->n);
+            $new_i = 0;
+            foreach ($indices as $i) {
+                $row = isset($this->data[$i]) ? $this->data[$i] : array();
+                $part->setRow($new_i++, $row);
+            }
+            $parts[] = $part;
+        }
+        return $parts;
+    }
+
+    /* Iterator Interface */
+
+    function rewind() { reset($this->data); }
+    function current() { return current($this->data); }
+    function key() { return key($this->data); }
+    function next() { return next($this->data); }
+    function valid() { return !is_null(key($this->data)); }
+}
+?>
\ No newline at end of file
diff --git a/lib/classifiers/lasso_regression.php b/lib/classifiers/lasso_regression.php
new file mode 100644
index 000000000..52da44901
--- /dev/null
+++ b/lib/classifiers/lasso_regression.php
@@ -0,0 +1,429 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage classifier
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/** Base class definition */
+require_once BASE_DIR."/lib/classifiers/classifier_algorithm.php";
+
+/**
+ * Implements the logistic regression text classification algorithm using lasso
+ * regression and a cyclic coordinate descent optimization step.
+ *
+ * This algorithm is rather slow to converge for large datasets or a large
+ * number of features, but it does provide regularization in order to combat
+ * over-fitting, and out-performs Naive-Bayes in tests on the same data set.
+ * The algorithm augments a standard cyclic coordinate descent approach by
+ * ``sleeping'' features that don't significantly change during a single step.
+ * Each time an optimization step for a feature doesn't change the feature
+ * weight beyond some threshold, that feature is forced to sit out the next
+ * optimization round. The threshold increases over successive rounds,
+ * effectively placing an upper limit on the number of iterations over all
+ * features, while simultaneously limiting the number of features updated on
+ * each round. This optimization speeds up convergence, but at the cost of some
+ * accuracy.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage classifier
+ */
+class LassoRegression extends ClassifierAlgorithm
+{
+    /** @var int level of logging detail; higher values mean more detail */
+    var $debug = 0;
+
+    /** @var float threshold used to determine convergence */
+    var $epsilon = 0.001;
+
+    /** @var float lambda parameter to CLG algorithm */
+    var $lambda = 1.0;
+
+    /**
+     * Beta vector of feature weights resulting from the training phase. The
+     * dot product of this vector with a feature vector yields the log
+     * likelihood that the feature vector describes a document belonging to the
+     * trained-for class.
+     * @var array
+     */
+    var $beta;
+
+    /**
+     * An adaptation of the Zhang-Oles 2001 CLG algorithm by Genkin et al. to
+     * use the Laplace prior for parameter regularization. On completion,
+     * optimizes the beta vector to maximize the likelihood of the data set.
+     *
+     * @param object $X SparseMatrix representing the training dataset
+     * @param array $y array of known labels corresponding to the rows of $X
+     */
+    function train($X, $y)
+    {
+        $invX = new InvertedData($X);
+        $this->lambda = $this->estimateLambdaNorm($invX);
+        $m = $invX->rows();
+        $n = $invX->columns();
+        $this->beta = array_fill(0, $n, 0.0);
+        $beta =& $this->beta;
+        $lambda = $this->lambda;
+        $d = array_fill(0, $n, 1.0);
+        $r = array_fill(0, $m, 0.0);
+        $converged = false;
+
+        $change = 0.0;
+        $score = 0.0;
+
+        $minDrj = $this->epsilon;
+        $schedule = new SplMaxHeap();
+        $nextSchedule = new SplMaxHeap();
+
+        for ($j = 0; $j < $n; $j++) {
+            $schedule->insert(array($this->epsilon, $j));
+        }
+
+        for ($k = 0; !$converged; $k++) {
+            $prevR = $r;
+
+            $var = 1;
+            while (!$schedule->isEmpty()) {
+                list($drj, $j) = $schedule->top();
+
+                // Features that changed too little sit this round out.
+                if ($drj < $minDrj) {
+                    break;
+                }
+                $schedule->extract();
+
+                $Xj = $invX->iterateColumn($j);
+
+                list($numer, $denom) = $this->computeApproxLikelihood(
+                    $Xj, $y, $r, $d[$j]);
+
+                // Compute tentative step $dvj
+                if ($denom == 0) {
+                    // Feature j has no usable entries, so it cannot move.
+                    $dvj = 0;
+                } else if ($beta[$j] == 0) {
+                    $dvj = ($numer - $lambda) / $denom;
+                    if ($dvj <= 0) {
+                        $dvj = ($numer + $lambda) / $denom;
+                        if ($dvj >= 0)
+                            $dvj = 0;
+                    }
+                } else {
+                    $s = $beta[$j] > 0 ? 1 : -1;
+                    $dvj = ($numer - ($s * $lambda)) / $denom;
+                    if ($s * ($beta[$j] + $dvj) < 0)
+                        $dvj = -$beta[$j];
+                }
+
+                if ($dvj == 0) {
+                    $d[$j] /= 2;
+                    $nextSchedule->insert(array($this->epsilon, $j, $k));
+                } else {
+                    // Compute delta for beta[j], constrained to trust region.
+                    $dbetaj = min(max($dvj, -$d[$j]), $d[$j]);
+
+                    // Update our cached dot product by the delta.
+                    $drj = 0.0;
+                    foreach ($Xj as $cell) {
+                        list($_, $i, $Xij) = $cell;
+                        $dr = $dbetaj * $Xij;
+                        $drj += $dr;
+                        $r[$i] += $dr;
+                    }
+
+                    $drj = abs($drj);
+                    $nextSchedule->insert(array($drj, $j, $k));
+
+                    $beta[$j] += $dbetaj;
+
+                    // Update the trust region.
+                    $d[$j] = max(2 * abs($dbetaj), $d[$j] / 2);
+                }
+
+                if ($this->debug > 1) {
+                    $score = $this->score($r, $y, $beta);
+                }
+
+                $this->log(sprintf(
+                    "itr = %3d, j = %4d (#%d), score = %6.2f, change = %6.4f",
+                    $k + 1, $j, $var, $score, $change));
+
+                $var++;
+            }
+
+            // Update $converged
+            $drSum = 0.0;
+            $rSum = 0.0;
+            for ($i = 0; $i < $m; $i++) {
+                $drSum += abs($r[$i] - $prevR[$i]);
+                $rSum += abs($r[$i]);
+            }
+            $change = $drSum / (1 + $rSum);
+
+            $converged = $change <= $this->epsilon;
+
+            while (!$schedule->isEmpty()) {
+                list($drj, $j) = $schedule->extract();
+                $nextSchedule->insert(array($drj * 4, $j));
+            }
+
+            $tmp = $schedule;
+            $schedule = $nextSchedule;
+            $nextSchedule = $tmp;
+
+            $minDrj *= 2;
+        }
+    }
+
+    /**
+     * Returns the pseudo-probability that a new instance is a positive example
+     * of the class the beta vector was trained to recognize. It only makes
+     * sense to try classification after at least some training
+     * has been done on a dataset that includes both positive and negative
+     * examples of the target class.
+     *
+     * @param array $x feature vector represented by an associative array
+     *  mapping features to their weights
+     * @return float pseudo-probability in the range [0, 1]
+     */
+    function classify($x)
+    {
+        $l = 0.0;
+        foreach ($x as $j => $xj) {
+            // Features without a trained weight contribute nothing.
+            if (isset($this->beta[$j])) {
+                $l += $xj * $this->beta[$j];
+            }
+        }
+        return 1.0 / (1.0 + exp(-$l));
+    }
+
+    /* PRIVATE INTERFACE */
+
+    /**
+     * Computes the approximate likelihood of y given a single feature, and
+     * returns it as a pair <numerator, denominator>.
+     *
+     * @param object $Xj iterator over the non-zero entries in column j of the
+     *  data
+     * @param array $y labels corresponding to entries in $Xj; each label is 1
+     *  if example i has the target label, and -1 otherwise
+     * @param array $r cached dot products of the beta vector and feature
+     *  weights for each example i
+     * @param float $d trust region for feature j
+     * @return array two-element array containing the numerator and denominator
+     *  of the likelihood
+     */
+    function computeApproxLikelihood($Xj, $y, $r, $d)
+    {
+        $numer = 0.0;
+        $denom = 0.0;
+
+        foreach ($Xj as $cell) {
+            list($j, $i, $Xij) = $cell;
+
+            $yi = $y[$i];
+            $ri = $yi * $r[$i];
+            $a = abs($ri);
+            $b = abs($d * $Xij);
+            if ($a <= $b) {
+                $F = 0.25;
+            } else {
+                $e = exp($a - $b);
+                $F = 1.0 / (2.0 + $e + (1.0/$e));
+            }
+            $numer += $Xij * $yi / (1 + exp($ri));
+            $denom += $Xij * $Xij * $F;
+        }
+
+        return array($numer, $denom);
+    }
+
+    /**
+     * Computes an approximate score that can be used to get an idea of how
+     * much a given optimization step improved the likelihood of the data set.
+     *
+     * @param array $r cached dot products of the beta vector and feature
+     *  weights for each example i
+     * @param array $y labels for each example
+     * @param array $beta beta vector of feature weights (used to
+     *  penalize large weights)
+     * @return float value proportional to the likelihood of the data,
+     *  penalized by the magnitude of the beta vector
+     */
+    function score($r, $y, $beta)
+    {
+        $score = 0;
+        foreach ($r as $i => $ri) {
+            $score += -log(1 + exp(-$ri * $y[$i]));
+        }
+        // Penalize by magnitude: signed weights would cancel each other.
+        return $score - array_sum(array_map('abs', $beta));
+    }
+
+    /**
+     * Estimates the lambda parameter from the dataset.
+     *
+     * @param object $invX inverted X matrix for dataset (essentially a posting
+     *  list of features in X)
+     * @return float lambda estimate
+     */
+    function estimateLambdaNorm($invX)
+    {
+        $sqNorm = 0;
+        foreach ($invX->iterateData() as $entry) {
+            $Xij = $entry[2];
+            $sqNorm += $Xij * $Xij;
+        }
+
+        $m = $invX->rows();
+        $n = $invX->columns();
+        if ($sqNorm == 0 || $n * $m == 0) {
+            // Nothing to estimate from; avoid dividing by zero below.
+            return 0.0;
+        }
+        $sigmaSq = $n * $m / $sqNorm;
+        return sqrt(2) / sqrt($sigmaSq);
+    }
+}
+
+/**
+ * Stores a data matrix in an inverted index on columns with non-zero entries.
+ *
+ * The index is just an array of entries <j, i, X[i][j]> sorted first by j and
+ * then by i, where all X[i][j] > 0. Provides a method to iterate over all rows
+ * which have a non-zero entry for a particular column (feature) j. There is
+ * no efficient way to iterate over rows in order.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage classifier
+ */
+class InvertedData
+{
+    /**
+     * Number of rows in the matrix.
+     * @var int
+     */
+    var $rows;
+
+    /**
+     * Number of columns in the matrix.
+     * @var int
+     */
+    var $columns;
+
+    /**
+     * Array of non-zero matrix entries.
+     * @var array
+     */
+    var $data;
+
+    /**
+     * Array of offsets into the $data array, where each offset gives the start
+     * of the entries for a particular feature.
+     * @var array
+     */
+    var $index;
+
+    /**
+     * Converts a SparseMatrix into an InvertedData instance. The data is
+     * duplicated.
+     *
+     * @param object $X SparseMatrix instance to convert
+     */
+    function __construct(SparseMatrix $X)
+    {
+        $this->rows = $X->rows();
+        $this->columns = $X->columns();
+        $this->data = array();
+        $this->index = array();
+
+        foreach ($X as $i => $row) {
+            foreach ($row as $j => $Xij) {
+                $this->data[] = array($j, $i, $Xij);
+            }
+        }
+
+        sort($this->data);
+
+        $lastVar = -1;
+        foreach ($this->data as $dataOffset => $x) {
+            $currVar = $x[0];
+            if ($currVar != $lastVar) {
+                for ($var = $lastVar + 1; $var <= $currVar; $var++)
+                    $this->index[$var] = $dataOffset;
+                $lastVar = $currVar;
+            }
+        }
+    }
+
+    function rows()    { return $this->rows; }
+    function columns() { return $this->columns; }
+
+    /**
+     * Returns an iterator over the values for a particular column of the
+     * matrix. If no matrix entry in the column is non-zero then an empty
+     * iterator is returned.
+     *
+     * @param int $j feature index (column) to iterate over
+     * @return object iterator over values in the column
+     */
+    function iterateColumn($j)
+    {
+        if (!isset($this->index[$j])) {
+            // No offset recorded: $j lies past the last populated column.
+            return new EmptyIterator();
+        }
+        $start = $this->index[$j];
+        $end = isset($this->index[$j + 1]) ?
+            $this->index[$j + 1] : count($this->data);
+        if ($end > $start) {
+            $arrItr = new ArrayIterator($this->data);
+            return new LimitIterator($arrItr, $start, $end - $start);
+        }
+        return new EmptyIterator();
+    }
+
+    /**
+     * Returns an iterator over the entire matrix. Note that this iterator is
+     * not in row order, but effectively in column order.
+     *
+     * @return object iterator over every non-zero entry in the matrix
+     */
+    function iterateData()
+    {
+        return new ArrayIterator($this->data);
+    }
+}
+?>
diff --git a/lib/classifiers/naive_bayes.php b/lib/classifiers/naive_bayes.php
new file mode 100644
index 000000000..39cd9ffaf
--- /dev/null
+++ b/lib/classifiers/naive_bayes.php
@@ -0,0 +1,201 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage classifier
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/** Base class definition */
+require_once BASE_DIR."/lib/classifiers/classifier_algorithm.php";
+
+/**
+ * Implements the Naive Bayes text classification algorithm.
+ *
+ * This class also provides a method to sample a beta vector from a dataset,
+ * making it easy to generate several slightly-different classifiers for the
+ * same dataset in order to form classifier committees.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage classifier
+ */
+class NaiveBayes extends ClassifierAlgorithm
+{
+    /**
+     * Parameter used to weight positive examples.
+     * @var float
+     */
+    var $gamma = 1.0;
+
+    /**
+     * Parameter used to weight negative examples.
+     * @var float
+     */
+    var $epsilon = 1.0;
+
+    /**
+     * Beta vector of feature weights resulting from the training phase. The
+     * dot product of this vector with a feature vector yields the log
+     * likelihood that the feature vector describes a document belonging to the
+     * trained-for class.
+     * @var array
+     */
+    var $beta;
+
+    /**
+     * Computes the beta vector from the given examples and labels. The
+     * examples are represented as a sparse matrix where each row is an example
+     * and each column a feature, and the labels as an array where each value
+     * is either 1 or -1, corresponding to a positive or negative example. Note
+     * that the first feature (column 0) corresponds to an intercept term, and
+     * is equal to 1 for every example.
+     *
+     * @param object $X SparseMatrix of training examples
+     * @param array $y example labels
+     */
+    function train(SparseMatrix $X, $y)
+    {
+        $n = $X->columns();
+        $p = array_fill(0, $n, 0);
+        $a = array_fill(0, $n, 0);
+        $this->beta = array_fill(0, $n, 0.0);
+        $beta =& $this->beta;
+
+        foreach ($X as $i => $row) {
+            foreach ($row as $j => $Xij) {
+                if ($y[$i] == 1) {
+                    $p[$j] += 1;
+                } else {
+                    $a[$j] += 1;
+                }
+            }
+        }
+
+        $beta[0] = $this->logit($p[0], $a[0]);
+        for ($j = 1; $j < $n; $j++) {
+            $beta[$j] = $this->logit($p[$j], $a[$j]) - $beta[0];
+        }
+    }
+
+    /**
+     * Constructs beta by sampling from the Gamma distribution for each
+     * feature, parameterized by the number of times the feature appears in
+     * positive examples, with a scale/rate of 1. This function is used to
+     * construct classifier committees.
+     *
+     * @param object $features Features instance for the training set, used to
+     *  determine how often a given feature occurs in positive and negative
+     *  examples
+     */
+    function sampleBeta($features)
+    {
+        $p = array();
+        $a = array();
+        $n = $features->numFeatures();
+        list($p[0], $a[0]) = $features->labelStats();
+        for ($j = 1; $j <= $n; $j++) {
+            list($t_l, $t_nl) = $features->varStats($j, 1);
+            $p[$j] = $this->sampleGammaDeviate(1 + $t_l);
+            $a[$j] = $this->sampleGammaDeviate(1 + $t_nl);
+        }
+
+        $this->beta = array();
+        $beta =& $this->beta;
+        $beta[0] = $this->logit($p[0], $a[0]);
+        for ($j = 1; $j <= $n; $j++) {
+            $beta[$j] = $this->logit($p[$j], $a[$j]) - $beta[0];
+        }
+    }
+
+    /**
+     * Returns the pseudo-probability that a new instance is a positive example
+     * of the class the beta vector was trained to recognize. It only makes
+     * sense to try classification after at least some training
+     * has been done on a dataset that includes both positive and negative
+     * examples of the target class.
+     *
+     * @param array $x feature vector represented by an associative array
+     *  mapping features to their weights
+     */
+    function classify($x)
+    {
+        $beta =& $this->beta;
+        $l = 0.0;
+        foreach ($x as $j => $xj) {
+            /*
+               The $x values are in {-1,1} instead of {0,1}, so we just
+               manually skip what would be the zero terms.
+            */
+            if ($xj == 1 && isset($beta[$j]))
+                $l += $beta[$j];
+        }
+        return 1.0 / (1.0 + exp(-$l));
+    }
+
+    /* PRIVATE INTERFACE */
+
+    /**
+     * Computes the log odds of a numerator and denominator, corresponding to
+     * the number of positive and negative examples exhibiting some feature.
+     *
+     * @param int $pos count of positive examples exhibiting some feature
+     * @param int $neg count of negative examples
+     * @return float log odds of seeing the feature in a positive example
+     */
+    function logit($pos, $neg)
+    {
+        $odds = ($pos + $this->gamma) / ($neg + $this->epsilon);
+        return log($odds);
+    }
+
+    /**
+     * Computes a Gamma deviate with beta = 1 and integral alpha. With these
+     * assumptions, the deviate is the sum of alpha exponential deviates, each
+     * of which is the negative log of a uniform deviate. The logs are summed
+     * individually because the product of many uniform deviates underflows
+     * to zero, which would make the deviate infinite.
+     *
+     * @param int $alpha parameter to Gamma distribution (in practice, a count
+     *  of occurrences of some feature)
+     * @return float a deviate from the Gamma distribution parameterized by
+     *  $alpha
+     */
+    function sampleGammaDeviate($alpha)
+    {
+        $deviate = 0.0;
+        $scale = getrandmax() + 1.0;
+        for ($i = 0; $i < $alpha; $i++) {
+            // Shift the draw into (0, 1] so that log() never sees zero.
+            $deviate -= log((rand() + 1.0) / $scale);
+        }
+        return $deviate;
+    }
+}
diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php
index 3b857c390..140155cc8 100644
--- a/lib/crawl_constants.php
+++ b/lib/crawl_constants.php
@@ -219,8 +219,9 @@ interface CrawlConstants
     const UI_FLAGS = 'cr';
     const KEYWORD_LINKS = 'cs';
     const END_ITERATOR = 'ct';
+    const ACTIVE_CLASSIFIERS = 'cu';
+    const ACTIVE_CLASSIFIERS_DATA = 'cv';

     const NEEDS_OFFSET_FLAG = 0x7FFFFFFF;
-
 }
 ?>
diff --git a/lib/upgrade_functions.php b/lib/upgrade_functions.php
index 50fa64fd9..655b40a4f 100644
--- a/lib/upgrade_functions.php
+++ b/lib/upgrade_functions.php
@@ -76,7 +76,7 @@ function upgradeDatabaseWorkDirectoryCheck()
         $result = @$model->db->execute($sql);
         if($result !== false) {
             $row = $model->db->fetchArray($result);
-            if(isset($row['ID']) && $row['ID'] >= 14) {
+            if(isset($row['ID']) && $row['ID'] >= 16) {
                 return false;
             } else {
                 return true;
@@ -94,7 +94,7 @@ function upgradeDatabaseWorkDirectoryCheck()
  */
 function upgradeDatabaseWorkDirectory()
 {
-    $versions = array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15);
+    $versions = array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16);
     $model = new Model();
     $model->db->selectDB(DB_NAME);
     $sql = "SELECT ID FROM VERSION";
@@ -230,6 +230,7 @@ function upgradeDatabaseVersion3(&$db)
         'Sắp xếp hoạt động dựa theo hoạch định')");

 }
+
 /**
  * Upgrades a Version 3 version of the Yioop! database to a Version 4 version
  * @param object $db datasource to use to upgrade
@@ -477,7 +478,75 @@ function upgradeDatabaseVersion15(&$db)
         AND GROUP_ID=0");
     $db->execute("INSERT INTO MIX_COMPONENTS VALUES(
         3, 0, 1, 1, 'media:video site:doc')");
-    $db->execute("INSERT INTO LOCALE VALUES (21, 'te', 'తెలుగు', 'lr-tb')");
+    $db->execute("INSERT INTO LOCALE VALUES (21, 'te',
+        'తెలుగు', 'lr-tb')");
+    upgradeLocales();
+}
+
+/**
+ * Upgrades a Version 15 version of the Yioop! database to a Version 16 version
+ * and lays out the work directory folders that Version 16 uses. The new
+ * manageClassifiers activity takes ID 6; existing IDs above 5 move up by one
+ * @param object $db datasource to use to upgrade
+ */
+function upgradeDatabaseVersion16(&$db)
+{
+    $db->execute("DELETE FROM VERSION WHERE ID < 15");
+    $db->execute("UPDATE VERSION SET ID=16 WHERE ID=15");
+    // presumably (ROLE_ID, ACTIVITY_ID) -- TODO confirm column order
+    $db->execute("INSERT INTO ROLE_ACTIVITY VALUES (1, 12)");
+    // Open up ID 6 in each table: rows above 5 (below 1000) move up 100, the
+    // new row goes in at 6, then rows above 6 (below 1000) move down 99
+    $db->execute("UPDATE ACTIVITY
+        SET ACTIVITY_ID = ACTIVITY_ID + 100,
+            TRANSLATION_ID = TRANSLATION_ID + 100
+        WHERE ACTIVITY_ID > 5 AND ACTIVITY_ID < 1000");
+    $db->execute("INSERT INTO ACTIVITY
+        VALUES (6, 6, 'manageClassifiers')");
+    $db->execute("UPDATE ACTIVITY
+        SET ACTIVITY_ID = ACTIVITY_ID - 99,
+            TRANSLATION_ID = TRANSLATION_ID - 99
+        WHERE ACTIVITY_ID > 6 AND ACTIVITY_ID < 1000");
+    // sqlite_sequence holds sqlite's AUTOINCREMENT counters; NOTE(review):
+    // this table is sqlite-specific, confirm the statement under other DBMS
+    $db->execute("UPDATE sqlite_sequence SET seq = 12
+        WHERE name = 'ACTIVITY'");
+
+    $db->execute("UPDATE TRANSLATION
+        SET TRANSLATION_ID = TRANSLATION_ID + 100
+        WHERE TRANSLATION_ID > 5 AND TRANSLATION_ID < 1000");
+    $db->execute("INSERT INTO TRANSLATION
+        VALUES (6, 'db_activity_manage_classifiers')");
+    $db->execute("UPDATE TRANSLATION
+        SET TRANSLATION_ID = TRANSLATION_ID - 99
+        WHERE TRANSLATION_ID > 6 AND TRANSLATION_ID < 1000");
+
+    $db->execute("UPDATE TRANSLATION_LOCALE
+        SET TRANSLATION_ID = TRANSLATION_ID + 100
+        WHERE TRANSLATION_ID > 5 AND TRANSLATION_ID < 1000");
+    $db->execute("INSERT INTO TRANSLATION_LOCALE
+        VALUES (6, 1, 'Classifiers')");
+    $db->execute("UPDATE TRANSLATION_LOCALE
+        SET TRANSLATION_ID = TRANSLATION_ID - 99
+        WHERE TRANSLATION_ID > 6 AND TRANSLATION_ID < 1000");
+
+    // archives used to live under cache; carry any existing ones over
+    $old_archives_path = WORK_DIRECTORY."/cache/archives";
+    $new_archives_path = WORK_DIRECTORY."/archives";
+    if (file_exists($old_archives_path)) {
+        rename($old_archives_path, $new_archives_path);
+    } else if (!file_exists($new_archives_path)) {
+        // guard keeps mkdir from warning when the folder is already there
+        mkdir($new_archives_path);
+    }
+    $db->setWorldPermissionsRecursive($new_archives_path);
+
+    $new_classifiers_path = WORK_DIRECTORY."/classifiers";
+    if (!file_exists($new_classifiers_path)) {
+        mkdir($new_classifiers_path);
+    }
+    $db->setWorldPermissionsRecursive($new_classifiers_path);
+
     upgradeLocales();
 }
 ?>
diff --git a/locale/en-US/configure.ini b/locale/en-US/configure.ini
index fdd3e7cb7..33451b1a8 100755
--- a/locale/en-US/configure.ini
+++ b/locale/en-US/configure.ini
@@ -28,409 +28,472 @@
 ;
 ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//controllers
 ;
-; admin_controller.php line: 138
+; admin_controller.php line: 140
 admin_controller_login_successful = "Login Successful!!"
 ;
-; admin_controller.php line: 143
+; admin_controller.php line: 145
 admin_controller_login_failed = "Username or Password Incorrect!"
 ;
-; admin_controller.php line: 148
+; admin_controller.php line: 150
 admin_controller_login_to_config = "Login to continue configuration (default: u=root, p=)"
 ;
-; admin_controller.php line: 152
+; admin_controller.php line: 154
 admin_controller_status_updates_stopped = "Status updates have stopped."
 ;
-; admin_controller.php line: 333
+; admin_controller.php line: 337
 admin_controller_news_off = "Updates Off"
 ;
-; admin_controller.php line: 334
+; admin_controller.php line: 338
 admin_controller_news_update_web = "Web Update"
 ;
-; admin_controller.php line: 335
+; admin_controller.php line: 339
 admin_controller_news_process = "News Process"
 ;
-; admin_controller.php line: 362
+; admin_controller.php line: 366
 admin_controller_passwords_dont_match = "Typed passwords do not match."
 ;
-; admin_controller.php line: 374
+; admin_controller.php line: 378
 admin_controller_invalid_old_password = "Current password incorrect."
 ;
-; admin_controller.php line: 381
+; admin_controller.php line: 385
 admin_controller_change_password = "Password change successful!!"
 ;
-; admin_controller.php line: 414
+; admin_controller.php line: 418
 admin_controller_select_username = "Select Name"
 ;
-; admin_controller.php line: 451
+; admin_controller.php line: 455
 admin_controller_select_rolename = "Select Role"
 ;
-; admin_controller.php line: 477
+; admin_controller.php line: 481
 admin_controller_passwords_dont_match = "Typed passwords do not match."
 ;
-; admin_controller.php line: 484
+; admin_controller.php line: 488
 admin_controller_username_exists = "Cannot Create User As Username Exists"
 ;
-; admin_controller.php line: 491
+; admin_controller.php line: 495
 admin_controller_username_added = "User Created"
 ;
-; admin_controller.php line: 500
+; admin_controller.php line: 504
 admin_controller_username_doesnt_exists = "Username Does Not Exist"
 ;
-; admin_controller.php line: 507
+; admin_controller.php line: 511
 admin_controller_username_deleted = "User Deleted"
 ;
-; admin_controller.php line: 514
+; admin_controller.php line: 518
 admin_controller_username_doesnt_exists = "Username Does Not Exist"
 ;
-; admin_controller.php line: 520
+; admin_controller.php line: 524
 admin_controller_rolename_doesnt_exists = "Role Name Does not Exist"
 ;
-; admin_controller.php line: 526
+; admin_controller.php line: 530
 admin_controller_rolename_added = "Role Name Added"
 ;
-; admin_controller.php line: 537
+; admin_controller.php line: 541
 admin_controller_username_doesnt_exists = "Username Does Not Exist"
 ;
-; admin_controller.php line: 543
+; admin_controller.php line: 547
 admin_controller_rolename_doesnt_exists = "Role Name Does not Exist"
 ;
-; admin_controller.php line: 553
+; admin_controller.php line: 557
 admin_controller_rolename_deleted = "Role Name Deleted"
 ;
-; admin_controller.php line: 583
+; admin_controller.php line: 587
 admin_controller_select_rolename = "Select Role"
 ;
-; admin_controller.php line: 618
+; admin_controller.php line: 622
 admin_controller_select_activityname = "Select Activity"
 ;
-; admin_controller.php line: 651
+; admin_controller.php line: 655
 admin_controller_rolename_exists = "Role Name Exists"
 ;
-; admin_controller.php line: 661
+; admin_controller.php line: 665
 admin_controller_rolename_added = "Role Name Added"
 ;
-; admin_controller.php line: 672
+; admin_controller.php line: 676
 admin_controller_rolename_doesnt_exists = "Role Name Does not Exist"
 ;
-; admin_controller.php line: 680
+; admin_controller.php line: 684
 admin_controller_rolename_deleted = "Role Name Deleted"
 ;
-; admin_controller.php line: 686
+; admin_controller.php line: 690
 admin_controller_rolename_doesnt_exists = "Role Name Does not Exist"
 ;
-; admin_controller.php line: 692
+; admin_controller.php line: 696
 admin_controller_activityname_doesnt_exists = "Activity Name Does not Exist"
 ;
-; admin_controller.php line: 702
+; admin_controller.php line: 706
 admin_controller_activity_added = "Activity Added"
 ;
-; admin_controller.php line: 708
+; admin_controller.php line: 712
 admin_controller_rolename_doesnt_exists = "Role Name Does not Exist"
 ;
-; admin_controller.php line: 715
+; admin_controller.php line: 719
 admin_controller_activityname_doesnt_exists = "Activity Name Does not Exist"
 ;
-; admin_controller.php line: 727
+; admin_controller.php line: 731
 admin_controller_activity_deleted = "Activity Deleted"
 ;
-; admin_controller.php line: 773
+; admin_controller.php line: 777
 admin_controller_stop_crawl = "Stopping crawl. . .This will take a moment to refresh."
 ;
-; admin_controller.php line: 787
+; admin_controller.php line: 791
 admin_controller_resume_crawl = "Resuming crawl. . .This will take a moment to refresh."
 ;
-; admin_controller.php line: 817
+; admin_controller.php line: 821
 admin_controller_delete_crawl_success = "Deleting Crawl. . .This will take a moment to refresh."
 ;
-; admin_controller.php line: 822
+; admin_controller.php line: 826
 admin_controller_delete_crawl_fail = "Delete Crawl Failed!!"
 ;
-; admin_controller.php line: 829
+; admin_controller.php line: 833
 admin_controller_set_index = "Setting Crawl To Use as Index"
 ;
-; admin_controller.php line: 854
+; admin_controller.php line: 858
 admin_controller_starting_new_crawl = "Starting New Crawl!"
 ;
-; admin_controller.php line: 864
+; admin_controller.php line: 868
 admin_controller_no_description = "No Description for Crawl"
 ;
-; admin_controller.php line: 973
+; admin_controller.php line: 981
 admin_controller_use_below = "Use options below"
 ;
-; admin_controller.php line: 974
+; admin_controller.php line: 982
 admin_controller_use_defaults = "Use Yioop! defaults"
 ;
-; admin_controller.php line: 976
+; admin_controller.php line: 984
 admin_controller_use_below = "Use options below"
 ;
-; admin_controller.php line: 980
+; admin_controller.php line: 988
 admin_controller_previous_crawl = "Previous Crawl:"
 ;
-; admin_controller.php line: 1067
+; admin_controller.php line: 1075
 admin_controller_breadth_first = "Breadth First"
 ;
-; admin_controller.php line: 1069
+; admin_controller.php line: 1077
 admin_controller_page_importance = "Page Importance"
 ;
-; admin_controller.php line: 1135
+; admin_controller.php line: 1143
 admin_controller_urls_injected = "Urls Injected!"
 ;
-; admin_controller.php line: 1146
+; admin_controller.php line: 1154
 admin_controller_update_seed_info = "Updating Seed Site Info!"
 ;
-; admin_controller.php line: 1238
+; admin_controller.php line: 1246
 admin_controller_select_crawl = "Select Crawl"
 ;
-; admin_controller.php line: 1239
+; admin_controller.php line: 1247
 admin_controller_default_crawl = "Default Crawl"
 ;
-; admin_controller.php line: 1241
+; admin_controller.php line: 1249
 admin_controller_select_crawl = "Select Crawl"
 ;
-; admin_controller.php line: 1243
+; admin_controller.php line: 1251
 admin_controller_default_crawl = "Default Crawl"
 ;
-; admin_controller.php line: 1270
+; admin_controller.php line: 1278
 admin_controller_unnamed = "Unnamed Crawl"
 ;
-; admin_controller.php line: 1275
+; admin_controller.php line: 1283
 admin_controller_mix_created = "Crawl Mix Created!"
 ;
-; admin_controller.php line: 1284
+; admin_controller.php line: 1292
 admin_controller_set_index = "Setting Crawl To Use as Index"
 ;
-; admin_controller.php line: 1294
+; admin_controller.php line: 1302
 admin_controller_mix_doesnt_exists = "Mix to Delete Does not Exist!"
 ;
-; admin_controller.php line: 1302
+; admin_controller.php line: 1310
 admin_controller_mix_deleted = "Crawl Mix Deleted!"
 ;
-; admin_controller.php line: 1338
+; admin_controller.php line: 1346
 editmix_element_add_crawls = "Add Crawls"
 ;
-; admin_controller.php line: 1340
+; admin_controller.php line: 1348
 editmix_element_num_results = "Number of Results"
 ;
-; admin_controller.php line: 1341
+; admin_controller.php line: 1349
 editmix_element_del_grp = "Delete Group"
 ;
-; admin_controller.php line: 1342
+; admin_controller.php line: 1350
 editmix_element_weight = "Weight"
 ;
-; admin_controller.php line: 1343
+; admin_controller.php line: 1351
 editmix_element_name = "Name"
 ;
-; admin_controller.php line: 1344
+; admin_controller.php line: 1352
 editmix_add_keywords = "Keywords"
 ;
-; admin_controller.php line: 1345
+; admin_controller.php line: 1353
 editmix_element_actions = "Actions"
 ;
-; admin_controller.php line: 1346
+; admin_controller.php line: 1354
 editmix_add_query = "Add Query"
 ;
-; admin_controller.php line: 1347
+; admin_controller.php line: 1355
 editmix_element_delete = "Delete"
 ;
-; admin_controller.php line: 1399
+; admin_controller.php line: 1407
 admin_controller_mix_saved = "Crawl Mix Changes Saved!"
 ;
-; admin_controller.php line: 1453
+; admin_controller.php line: 1458
 admin_controller_use_below = "Use options below"
 ;
-; admin_controller.php line: 1454
+; admin_controller.php line: 1459
 admin_controller_use_defaults = "Use Yioop! defaults"
 ;
-; admin_controller.php line: 1456
+; admin_controller.php line: 1461
 admin_controller_use_below = "Use options below"
 ;
-; admin_controller.php line: 1460
+; admin_controller.php line: 1465
 admin_controller_previous_crawl = "Previous Crawl:"
 ;
-; admin_controller.php line: 1465
+; admin_controller.php line: 1470
 admin_controller_recrawl_never = "Never"
 ;
-; admin_controller.php line: 1466
+; admin_controller.php line: 1471
 admin_controller_recrawl_1day = "1 days"
 ;
-; admin_controller.php line: 1467
+; admin_controller.php line: 1472
 admin_controller_recrawl_2day = "2 days"
 ;
-; admin_controller.php line: 1468
+; admin_controller.php line: 1473
 admin_controller_recrawl_3day = "3 days"
 ;
-; admin_controller.php line: 1469
+; admin_controller.php line: 1474
 admin_controller_recrawl_7day = "7 days"
 ;
-; admin_controller.php line: 1470
+; admin_controller.php line: 1475
 admin_controller_recrawl_14day = "14 days"
 ;
-; admin_controller.php line: 1657
+; admin_controller.php line: 1685
 admin_controller_page_options_updated = "Page Options Updated!"
 ;
-; admin_controller.php line: 1683
+; admin_controller.php line: 1711
 admin_controller_page_options_running_tests = "Running Tests!"
 ;
-; admin_controller.php line: 1800
+; admin_controller.php line: 1851
+admin_controller_new_classifier = "New classifier created."
+;
+; admin_controller.php line: 1854
+admin_controller_classifier_exists = "A classifier with that name already exists."
+;
+; admin_controller.php line: 1866
+admin_controller_no_classifier = "No classifier with that name."
+;
+; admin_controller.php line: 1884
+admin_controller_finalizing_classifier = "Finalizing classifier."
+;
+; admin_controller.php line: 1907
+admin_controller_classifier_deleted = "Classifier deleted."
+;
+; admin_controller.php line: 1911
+admin_controller_no_classifier = "No classifier with that name."
+;
+; admin_controller.php line: 1960
+admin_controller_classifier_exists = "A classifier with that name already exists."
+;
+; admin_controller.php line: 1971
+editclassifier_load_failed = "Failed to load documents"
+;
+; admin_controller.php line: 1973
+editclassifier_loading = "Loading"
+;
+; admin_controller.php line: 1975
+editclassifier_added_examples = "Added {1} {2} examples"
+;
+; admin_controller.php line: 1977
+editclassifier_label_update_failed = "Failed to update labels."
+;
+; admin_controller.php line: 1979
+editclassifier_updating = "Updating"
+;
+; admin_controller.php line: 1981
+editclassifier_acc_update_failed = "Failed to update accuracy"
+;
+; admin_controller.php line: 1983
+editclassifier_na = "N/A"
+;
+; admin_controller.php line: 1985
+editclassifier_no_docs = "No documents"
+;
+; admin_controller.php line: 1987
+editclassifier_num_docs = "{1}{2} documents"
+;
+; admin_controller.php line: 1989
+editclassifier_in_class = "In Class"
+;
+; admin_controller.php line: 1991
+editclassifier_not_in_class = "Not In Class"
+;
+; admin_controller.php line: 1993
+editclassifier_skip = "Skip"
+;
+; admin_controller.php line: 1995
+editclassifier_prediction = "Prediction: {1}"
+;
+; admin_controller.php line: 1997
+editclassifier_scores = "{1}%% confidence, {2}%% disagreement"
+;
+; admin_controller.php line: 2041
 admin_controller_results_editor_update = "Filter Pages Updated!"
 ;
-; admin_controller.php line: 1814
+; admin_controller.php line: 2055
 admin_controller_edited_pages = "Select a Previously Edited URL"
 ;
-; admin_controller.php line: 1827
+; admin_controller.php line: 2068
 admin_controller_results_editor_need_url = "Result Page Update needs to Specify the URL!"
 ;
-; admin_controller.php line: 1833
+; admin_controller.php line: 2074
 admin_controller_results_editor_page_updated = "Result Page Updated!"
 ;
-; admin_controller.php line: 1846
+; admin_controller.php line: 2087
 admin_controller_results_editor_page_loaded = "Page Loaded!"
 ;
-; admin_controller.php line: 1891
+; admin_controller.php line: 2132
 admin_controller_select_machine = "Select Machine"
 ;
-; admin_controller.php line: 1962
+; admin_controller.php line: 2203
 admin_controller_machine_added = "Machine Added!"
 ;
-; admin_controller.php line: 1969
+; admin_controller.php line: 2210
 admin_controller_machine_exists = "Machine Name Already Exists; Please Delete First!"
 ;
-; admin_controller.php line: 1973
+; admin_controller.php line: 2214
 admin_controller_machine_incomplete = "Missing Fields From Machine Form!"
 ;
-; admin_controller.php line: 1982
+; admin_controller.php line: 2223
 admin_controller_machine_doesnt_exists = "Machine Name does not Exists!"
 ;
-; admin_controller.php line: 1999
+; admin_controller.php line: 2240
 admin_controller_stop_service_first = "Machine in use. Please stop the service running on it!"
 ;
-; admin_controller.php line: 2012
+; admin_controller.php line: 2253
 admin_controller_machine_deleted = "Machine Deleted!"
 ;
-; admin_controller.php line: 2033
+; admin_controller.php line: 2274
 admin_controller_news_process_running = "News Updater Seems To Be Running Already"
 ;
-; admin_controller.php line: 2041
+; admin_controller.php line: 2282
 admin_controller_news_mode_updated = "News Update Mode Changed!"
 ;
-; admin_controller.php line: 2045
+; admin_controller.php line: 2286
 admin_controller_news_update_failed = "News Update Mode Change Failed!"
 ;
-; admin_controller.php line: 2108
+; admin_controller.php line: 2349
 admin_controller_no_machine_log = "No Log File Found."
 ;
-; admin_controller.php line: 2137
+; admin_controller.php line: 2378
 admin_controller_machine_servers_updated = "Machine&#039;s Servers Updated!"
 ;
-; admin_controller.php line: 2141
+; admin_controller.php line: 2382
 admin_controller_machine_no_action = "Unable to Perform Action!"
 ;
-; admin_controller.php line: 2174
+; admin_controller.php line: 2415
 admin_controller_select_localename = "Select Locale"
 ;
-; admin_controller.php line: 2218
+; admin_controller.php line: 2459
 admin_controller_locale_added = "Locale Added!"
 ;
-; admin_controller.php line: 2225
+; admin_controller.php line: 2466
 admin_controller_localename_doesnt_exists = "Locale Does Not Exist!"
 ;
-; admin_controller.php line: 2234
+; admin_controller.php line: 2475
 admin_controller_localename_deleted = "Locale Deleted"
 ;
-; admin_controller.php line: 2243
+; admin_controller.php line: 2484
 admin_controller_select_staticpages = "Select a page"
 ;
-; admin_controller.php line: 2262
+; admin_controller.php line: 2503
 admin_controller_staticpage_updated = "Static Page Updated!"
 ;
-; admin_controller.php line: 2289
+; admin_controller.php line: 2530
 admin_controller_localestrings_updated = "Locale Strings Updated!"
 ;
-; admin_controller.php line: 2346
+; admin_controller.php line: 2587
 admin_controller_php_version = "PHP Version 5.3 or Newer"
 ;
-; admin_controller.php line: 2354
+; admin_controller.php line: 2595
 admin_controller_no_write_config_php = "configs/config.php not web server writable."
 ;
-; admin_controller.php line: 2359
+; admin_controller.php line: 2600
 admin_controller_no_write_work_dir = "Work directory needs to be writable by web server. "
 ;
-; admin_controller.php line: 2364
+; admin_controller.php line: 2605
 admin_controller_post_size_small = "php.ini file variable post_max_size should be at least 2M"
 ;
-; admin_controller.php line: 2370
+; admin_controller.php line: 2611
 admin_controller_missing_required = "The following required items were missing: %s"
 ;
-; admin_controller.php line: 2393
+; admin_controller.php line: 2634
 admin_controller_missing_optional = "The following optional items were missing: %s"
 ;
-; admin_controller.php line: 2398
+; admin_controller.php line: 2639
 admin_controller_check_passed = "Check Passed."
 ;
-; admin_controller.php line: 2403
+; admin_controller.php line: 2644
 admin_controller_using_local_config = "Using configs/local_config.php so changing work directory above may not work."
 ;
-; admin_controller.php line: 2428
+; admin_controller.php line: 2669
 admin_controller_media_kind = "Media Kind"
 ;
-; admin_controller.php line: 2429
+; admin_controller.php line: 2670
 admin_controller_video = "Video"
 ;
-; admin_controller.php line: 2430
-admin_controller_rss_feed = "News Feed"
+; admin_controller.php line: 2671
+admin_controller_rss_feed = "RSS"
 ;
-; admin_controller.php line: 2444
+; admin_controller.php line: 2685
 admin_controller_sources_indexes = "Index/Mix to Use"
 ;
-; admin_controller.php line: 2497
+; admin_controller.php line: 2738
 admin_controller_media_source_added = "Media Source Added!"
 ;
-; admin_controller.php line: 2505
+; admin_controller.php line: 2746
 admin_controller_media_source_deleted = "Media Source Deleted!"
 ;
-; admin_controller.php line: 2521
+; admin_controller.php line: 2762
 admin_controller_subsearch_added = "Subsearch Added!"
 ;
-; admin_controller.php line: 2529
+; admin_controller.php line: 2770
 admin_controller_subsearch_deleted = "Subsearch Deleted!"
 ;
-; admin_controller.php line: 2604
+; admin_controller.php line: 2845
 admin_controller_configure_use_absolute_path = "Must use an Absolute path for Work Directory"
 ;
-; admin_controller.php line: 2616
+; admin_controller.php line: 2857
 admin_controller_configure_diff_base_dir = "Work Directory cannot be contained in Yioop folder!"
 ;
-; admin_controller.php line: 2649
+; admin_controller.php line: 2890
 admin_controller_configure_work_dir_set = "Work Directory Set! You may need to re-login!"
 ;
-; admin_controller.php line: 2663
+; admin_controller.php line: 2904
 admin_controller_name_your_bot = "Please Name Your robot"
 ;
-; admin_controller.php line: 2686
+; admin_controller.php line: 2927
 admin_controller_configure_work_profile_made = "Working Directory and Profile Created!"
 ;
-; admin_controller.php line: 2699
+; admin_controller.php line: 2940
 admin_controller_configure_no_set_config = "Unable to Update config.php File!"
 ;
-; admin_controller.php line: 2711
+; admin_controller.php line: 2952
 admin_controller_configure_no_create_profile = "Unable to Create Profile!"
 ;
-; admin_controller.php line: 2722
+; admin_controller.php line: 2963
 admin_controller_configure_work_dir_invalid = "Work Directory is Invalid! Cannot Create Profile!"
 ;
-; admin_controller.php line: 2734
+; admin_controller.php line: 2975
 admin_controller_configure_work_dir_invalid = "Work Directory is Invalid! Cannot Create Profile!"
 ;
-; admin_controller.php line: 2782
+; admin_controller.php line: 3023
 admin_controller_configure_no_change_db = "Problem Updating Database!"
 ;
-; admin_controller.php line: 2797
+; admin_controller.php line: 3038
 admin_controller_configure_profile_change = "Profile Updated!"
 ;
-; admin_controller.php line: 2812
+; admin_controller.php line: 3053
 admin_controller_configure_no_change_profile = "There was a Problem Updating Profile!"
 ;
-; admin_controller.php line: 2850
+; admin_controller.php line: 3091
 admin_controller_describe_robot = "Please Describe Your Robot"
 ;
 ; machine_controller.php line: 182
@@ -448,55 +511,55 @@ search_controller_crawl_info = "Index: %s -- Size: %s pages/%s urls"
 ; search_controller.php line: 528
 search_controller_search = "Search"
 ;
-; search_controller.php line: 629
+; search_controller.php line: 636
 search_controller_no_index_set = "No Search Index Set For Use!"
 ;
-; search_controller.php line: 632
+; search_controller.php line: 639
 search_controller_no_index_set = "No Search Index Set For Use!"
 ;
-; search_controller.php line: 1430
+; search_controller.php line: 1438
 search_controller_no_archive_page = "The website in question has requested this page not be archived."
 ;
-; search_controller.php line: 1478
+; search_controller.php line: 1490
 search_controller_original_page = "This image appeared on the page:"
 ;
-; search_controller.php line: 1496
+; search_controller.php line: 1508
 search_controller_extracted_title = "Extracted Title"
 ;
-; search_controller.php line: 1498
+; search_controller.php line: 1510
 search_controller_extracted_description = "Extracted Description"
 ;
-; search_controller.php line: 1500
+; search_controller.php line: 1512
 search_controller_extracted_links = "Extracted Links"
 ;
-; search_controller.php line: 1505
+; search_controller.php line: 1517
 search_controller_extracted_allow_paths = "Extracted Allowed To Crawl Paths"
 ;
-; search_controller.php line: 1511
+; search_controller.php line: 1523
 search_controller_extracted_disallow_paths = "Extracted Disallowed To Crawl Paths"
 ;
-; search_controller.php line: 1517
+; search_controller.php line: 1529
 search_controller_crawl_delay = "YioopBot Crawl Delay"
 ;
-; search_controller.php line: 1590
+; search_controller.php line: 1600
 search_controller_cache_comment = "Yioop Cache Page... This page has been modified to add a robots directive,  make links absolute, add extracted summaries, and to highlight query terms."
 ;
-; search_controller.php line: 1636
+; search_controller.php line: 1644
 search_controller_cached_version = "This cached version of %s was obtained by the Yioop crawler on %s."
 ;
-; search_controller.php line: 1751
+; search_controller.php line: 1759
 search_controller_header_summaries = "Toggle Extracted Headers and Summaries"
 ;
-; search_controller.php line: 1875
+; search_controller.php line: 1883
 search_controller_history = "Toggle History"
 ;
-; search_controller.php line: 2051
+; search_controller.php line: 2059
 search_controller_all_cached = "All Cached Versions - Change Year and/or Months to see Links"
 ;
-; search_controller.php line: 2082
+; search_controller.php line: 2090
 search_controller_year = "Year:"
 ;
-; search_controller.php line: 2083
+; search_controller.php line: 2091
 search_controller_month = "Month:"
 ;
 ; settings_controller.php line: 134
@@ -779,6 +842,63 @@ crawloptions_element_need_api_for_mix = "Yioop API access required for mix archi
 ; crawloptions_element.php line: 167
 crawloptions_element_save_options = "Save Options"
 ;
+; editclassifier_element.php line: 63
+editclassifier_back = "Back"
+;
+; editclassifier_element.php line: 65
+editclassifier_edit_classifier = "Edit Classifier"
+;
+; editclassifier_element.php line: 77
+editclassifier_classifier_label = "Classifier Label:"
+;
+; editclassifier_element.php line: 82
+editclassifier_change = "Change"
+;
+; editclassifier_element.php line: 85
+editclassifier_statistics = "Statistics"
+;
+; editclassifier_element.php line: 86
+editclassifier_positive_examples = "Positive Examples:"
+;
+; editclassifier_element.php line: 89
+editclassifier_negative_examples = "Negative Examples:"
+;
+; editclassifier_element.php line: 92
+editclassifier_accuracy = "Accuracy:"
+;
+; editclassifier_element.php line: 97
+editclassifier_na = "N/A"
+;
+; editclassifier_element.php line: 102
+editclassifier_update = "Update"
+;
+; editclassifier_element.php line: 103
+editclassifier_add_examples = "Add Examples"
+;
+; editclassifier_element.php line: 107
+editclassifier_source = "Source:"
+;
+; editclassifier_element.php line: 111
+editclassifier_default_crawl = "Default Crawl"
+;
+; editclassifier_element.php line: 121
+editclassifier_label_by_hand = "Label By Hand"
+;
+; editclassifier_element.php line: 123
+editclassifier_all_in_class = "All In Class"
+;
+; editclassifier_element.php line: 125
+editclassifier_none_in_class = "None In Class"
+;
+; editclassifier_element.php line: 130
+editclassifier_keywords = "Keywords:"
+;
+; editclassifier_element.php line: 135
+editclassifier_load = "Load"
+;
+; editclassifier_element.php line: 141
+editclassifier_no_documents = "No Documents"
+;
 ; editlocales_element.php line: 62
 editlocales_element_back_to_manage = "Back"
 ;
@@ -869,6 +989,48 @@ manageaccount_element_retype_password = "Retype Password: "
 ; manageaccount_element.php line: 84
 manageaccount_element_save = "Save"
 ;
+; manageclassifiers_element.php line: 58
+manageclassifiers_manage_classifiers = "Manage Classifiers"
+;
+; manageclassifiers_element.php line: 66
+manageclassifiers_classifier_name = "Classifier Name:"
+;
+; manageclassifiers_element.php line: 71
+manageclassifiers_create_button = "Create"
+;
+; manageclassifiers_element.php line: 75
+manageclassifiers_available_classifiers = "Available Classifiers"
+;
+; manageclassifiers_element.php line: 78
+manageclassifiers_label_col = "Label"
+;
+; manageclassifiers_element.php line: 79
+manageclassifiers_positive_col = "Positive"
+;
+; manageclassifiers_element.php line: 80
+manageclassifiers_negative_col = "Negative"
+;
+; manageclassifiers_element.php line: 82
+manageclassifiers_actions_col = "Actions"
+;
+; manageclassifiers_element.php line: 95
+manageclassifiers_edit = "Edit"
+;
+; manageclassifiers_element.php line: 98
+manageclassifiers_finalized = "Finalized"
+;
+; manageclassifiers_element.php line: 104
+manageclassifiers_finalize = "Finalize"
+;
+; manageclassifiers_element.php line: 106
+manageclassifiers_finalize = "Finalize"
+;
+; manageclassifiers_element.php line: 109
+manageclassifiers_finalizing = "Finalizing"
+;
+; manageclassifiers_element.php line: 115
+manageclassifiers_delete = "Delete"
+;
 ; managecrawls_element.php line: 56
 managecrawls_element_create_crawl = "Create Crawl"
 ;
@@ -1076,121 +1238,127 @@ mixcrawl_search_index = "Search Index"
 ; mixcrawls_element.php line: 130
 mixcrawls_view_delete = "Delete"
 ;
-; pageoptions_element.php line: 66
+; pageoptions_element.php line: 63
 pageoptions_element_crawl_time = "Crawl Time"
 ;
-; pageoptions_element.php line: 71
+; pageoptions_element.php line: 68
 pageoptions_element_search_time = "Search Time"
 ;
-; pageoptions_element.php line: 76
+; pageoptions_element.php line: 73
 pageoptions_element_test_options = "Test Options"
 ;
-; pageoptions_element.php line: 88
+; pageoptions_element.php line: 85
 pageoptions_element_load_options = "Get Page Options From:"
 ;
-; pageoptions_element.php line: 93
+; pageoptions_element.php line: 90
 pageoptions_element_page_range = "Byte Range to Download (0 - Value):"
 ;
-; pageoptions_element.php line: 98
+; pageoptions_element.php line: 95
 pageoptions_element_save_cache = "Cache whole crawled pages:"
 ;
-; pageoptions_element.php line: 108
+; pageoptions_element.php line: 105
 pageoptions_element_allow_recrawl = "Allow Page Recrawl After:"
 ;
-; pageoptions_element.php line: 114
+; pageoptions_element.php line: 111
 pageoptions_element_file_types = "Page File Types to Crawl:"
 ;
-; pageoptions_element.php line: 142
+; pageoptions_element.php line: 141
+pageoptions_element_classifiers_to_apply = "Classifiers to Apply:"
+;
+; pageoptions_element.php line: 176
+pageoptions_element_no_classifiers = "No classifiers."
+;
+; pageoptions_element.php line: 179
 pageoptions_element_indexing_plugins = "Indexing Plugins"
 ;
-; pageoptions_element.php line: 145
+; pageoptions_element.php line: 183
 pageoptions_element_plugin = "Plugin"
 ;
-; pageoptions_element.php line: 148
+; pageoptions_element.php line: 186
 pageoptions_element_plugin_include = "Use in Crawl"
 ;
-; pageoptions_element.php line: 168
+; pageoptions_element.php line: 207
 pageoptions_element_no_compatible_plugins = "No compatible indexing plugins found!"
 ;
-; pageoptions_element.php line: 171
+; pageoptions_element.php line: 210
 pageoptions_element_page_rules = "Page Field Extraction Rules"
 ;
-; pageoptions_element.php line: 179
+; pageoptions_element.php line: 218
 page_element_search_page = "Search Page Elements and Links"
 ;
-; pageoptions_element.php line: 184
+; pageoptions_element.php line: 223
 pageoptions_element_wd_suggest = "Word Suggest"
 ;
-; pageoptions_element.php line: 192
+; pageoptions_element.php line: 231
 pageoptions_element_subsearch_link = "Subsearch"
 ;
-; pageoptions_element.php line: 201
+; pageoptions_element.php line: 240
 pageoptions_element_signin_link = "Signin"
 ;
-; pageoptions_element.php line: 208
+; pageoptions_element.php line: 247
 pageoptions_element_cache_link = "Cache"
 ;
-; pageoptions_element.php line: 216
+; pageoptions_element.php line: 255
 pageoptions_element_similar_link = "Similar"
 ;
-; pageoptions_element.php line: 224
+; pageoptions_element.php line: 263
 pageoptions_element_in_link = "Inlinks"
 ;
-; pageoptions_element.php line: 230
+; pageoptions_element.php line: 269
 pageoptions_element_ip_link = "IP Address"
 ;
-; pageoptions_element.php line: 239
+; pageoptions_element.php line: 278
 pageoptions_element_ranking_factors = "Search Ranking Factors"
 ;
-; pageoptions_element.php line: 242
+; pageoptions_element.php line: 281
 pageoptions_element_title_weight = "Title Weight:"
 ;
-; pageoptions_element.php line: 247
+; pageoptions_element.php line: 286
 pageoptions_element_description_weight = "Description Weight:"
 ;
-; pageoptions_element.php line: 252
+; pageoptions_element.php line: 291
 pageoptions_element_link_weight = "Link Weight:"
 ;
-; pageoptions_element.php line: 257
+; pageoptions_element.php line: 296
 pageoptions_element_results_grouping_options = "Search Results Grouping"
 ;
-; pageoptions_element.php line: 260
+; pageoptions_element.php line: 299
 pageoptions_element_min_results_to_group = "Minimum Results to Group:"
 ;
-; pageoptions_element.php line: 265
+; pageoptions_element.php line: 304
 pageoptions_element_server_alpha = "Server Alpha:"
 ;
-; pageoptions_element.php line: 273
+; pageoptions_element.php line: 312
 pageoptions_element_test_page = "Test Page"
 ;
-; pageoptions_element.php line: 275
+; pageoptions_element.php line: 314
 pageoptions_element_page_type = "Type:"
 ;
-; pageoptions_element.php line: 292
+; pageoptions_element.php line: 331
 pageoptions_element_save_options = "Save"
 ;
-; pageoptions_element.php line: 294
+; pageoptions_element.php line: 333
 pageoptions_element_run_tests = "Test Process Page"
 ;
-; pageoptions_element.php line: 300
+; pageoptions_element.php line: 339
 pageoptions_element_test_results = "Test Results"
 ;
-; pageoptions_element.php line: 303
+; pageoptions_element.php line: 342
 pageoptions_element_after_process = "After page processor extracts summary"
 ;
-; pageoptions_element.php line: 307
+; pageoptions_element.php line: 346
 pageoptions_element_after_rules = "After page rules applied"
 ;
-; pageoptions_element.php line: 311
+; pageoptions_element.php line: 350
 pageoptions_element_extracted_words = "Words and positions extracted to index from summary"
 ;
-; pageoptions_element.php line: 315
+; pageoptions_element.php line: 354
 pageoptions_element_extracted_metas = "Extracted meta words"
 ;
-; pageoptions_element.php line: 349
+; pageoptions_element.php line: 388
 pageoptions_element_run_tests = "Test Process Page"
 ;
-; pageoptions_element.php line: 353
+; pageoptions_element.php line: 392
 pageoptions_element_save_options = "Save"
 ;
 ; resultseditor_element.php line: 58
diff --git a/models/crawl_model.php b/models/crawl_model.php
index d87d44eec..227234cb3 100755
--- a/models/crawl_model.php
+++ b/models/crawl_model.php
@@ -36,6 +36,8 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

 /** For base class*/
 require_once BASE_DIR."/models/parallel_model.php";
+/** For deleting save points*/
+require_once BASE_DIR."/controllers/search_controller.php";

 /** used to prevent cache page requests from being logged*/
 if(!defined("POST_PROCESSING") && !defined("NO_LOGGING")) {
@@ -347,9 +349,27 @@ class CrawlModel extends ParallelModel implements CrawlConstants
         $this->db->execute($sql);
         $sql = "DELETE FROM MIX_COMPONENTS WHERE MIX_TIMESTAMP='$timestamp'";
         $this->db->execute($sql);
-
     }

+    /**
+     * Deletes the archive iterator and savepoint files created during the
+     * process of iterating through a crawl mix.
+     *
+     * @param int $timestamp The timestamp of the crawl mix
+     */
+    function deleteCrawlMixIteratorState($timestamp)
+    {
+        global $INDEXING_PLUGINS;
+        setLocaleObject(getLocaleTag());
+        $searchController = new SearchController($INDEXING_PLUGINS);
+        $searchController->clearQuerySavepoint($timestamp);
+
+        $archive_dir = WORK_DIRECTORY."/schedules/".
+            self::name_archive_iterator.$timestamp;
+        if (file_exists($archive_dir)) {
+            $this->db->unlinkRecursive($archive_dir);
+        }
+    }

     /**
      *  Returns the initial sites that a new crawl will start with along with
@@ -450,6 +470,14 @@ EOT;
         }
         $n[] = "";

+        $n[] = "[active_classifiers]";
+        if(isset($info['active_classifiers'])) {
+            foreach ($info['active_classifiers']['label'] as $label) {
+                $n[] = "label[] = '$label';";
+            }
+        }
+        $n[] = "";
+
         $site_types =
             array('allowed_sites' => 'url', 'disallowed_sites' => 'url',
                 'seed_sites' => 'url', 'page_rules'=>'rule');
@@ -589,7 +617,9 @@ EOT;
                 "disallowed_sites" => array(self::DISALLOWED_SITES, 'url'),
                 "page_rules" => array(self::PAGE_RULES, 'rule'),
                 "indexed_file_types" => array(self::INDEXED_FILE_TYPES,
-                    "extensions")
+                    "extensions"),
+                "active_classifiers" => array(self::ACTIVE_CLASSIFIERS,
+                    'label')
             );
             foreach($updatable_site_info as $type => $info) {
                 if(isset($new_info[$type][$info[1]])) {
@@ -900,7 +930,7 @@ EOT;
             $list[] = $crawl;
         }
         if($return_arc_bundles) {
-            $dirs = glob(CRAWL_DIR.'/cache/archives/*', GLOB_ONLYDIR);
+            $dirs = glob(CRAWL_DIR.'/archives/*', GLOB_ONLYDIR);
             foreach($dirs as $dir) {
                 $crawl = array();
                 $crawl['CRAWL_TIME'] = crc32($dir);
diff --git a/models/parallel_model.php b/models/parallel_model.php
index 143c72834..ebe8ca04f 100755
--- a/models/parallel_model.php
+++ b/models/parallel_model.php
@@ -443,15 +443,25 @@ class ParallelModel extends Model implements CrawlConstants
      */
     function clearQuerySavePoint($save_timestamp, $machine_urls = NULL)
     {
+        /*
+           It's important to quit early when the timestamp is empty: the glob
+           below would otherwise match, and delete, every SavePoint* file.
+        */
+        if (!$save_timestamp) return;
+
         if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
             $this->execMachines("clearQuerySavePoint", $machine_urls,
                 $save_timestamp);
             return;
         }

-        $save_files = glob(CRAWL_DIR.'/schedules/'.self::save_point.
-            $save_timestamp."*.txt");
-        foreach($save_files as $save_file) {
+        /*
+           SavePoint files have a $qpart tagged on to the timestamp to
+           distinguish between parts of a query, so we want to delete anything
+           that starts with the appropriate timestamp.
+        */
+        $save_stub = CRAWL_DIR.'/schedules/'.self::save_point.$save_timestamp;
+        foreach (glob($save_stub.'*.txt') as $save_file) {
             @unlink($save_file);
         }
     }
diff --git a/models/phrase_model.php b/models/phrase_model.php
index d59aaab1e..7a17ba84d 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -92,7 +92,7 @@ class PhraseModel extends ParallelModel
             'filetype:', 'info:', '\-', 'os:', 'server:', 'date:', "numlinks:",
             'index:', 'i:', 'ip:', 'weight:', 'w:', 'u:', 'time:', 'code:',
             'lang:', 'media:', 'elink:', 'location:', 'size:', 'host:', 'dns:',
-            'path:', 'robot:', 'safe:', 'guid:');
+            'path:', 'robot:', 'safe:', 'guid:', 'class:', 'class-score:');

     /**
      * Number of pages to cache in one go in memcache or filecache
@@ -401,7 +401,7 @@ class PhraseModel extends ParallelModel
         }
         if(isset($total_rows)) {
             $results['TOTAL_ROWS'] = $total_rows;
-        } else {
+        } else if (isset($results['PAGES'])) {
             $results['TOTAL_ROWS'] = count($results['PAGES']);
         }

diff --git a/models/profile_model.php b/models/profile_model.php
index 3e953e8ad..b9cfb4ab2 100644
--- a/models/profile_model.php
+++ b/models/profile_model.php
@@ -80,10 +80,11 @@ class ProfileModel extends Model
     {

         $to_make_dirs = array($directory, "$directory/app",
-            "$directory/cache", "$directory/data", "$directory/feeds",
-            "$directory/locale", "$directory/log",
-            "$directory/prepare", "$directory/schedules",
-            "$directory/search_filters", "$directory/temp");
+            "$directory/archives", "$directory/cache",
+            "$directory/classifiers", "$directory/data", "$directory/feeds",
+            "$directory/locale", "$directory/log", "$directory/prepare",
+            "$directory/schedules", "$directory/search_filters",
+            "$directory/temp");
         $dir_status = array();
         foreach($to_make_dirs as $dir) {
             $dir_status[$dir] = $this->createIfNecessaryDirectory($dir);
diff --git a/scripts/classifiers.js b/scripts/classifiers.js
new file mode 100644
index 000000000..8ed1b9131
--- /dev/null
+++ b/scripts/classifiers.js
@@ -0,0 +1,840 @@
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage javascript
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+/**
+ * Implements the client interface for finding and labeling documents.
+ *
+ * Classifier behaves like a static class with some private variables and
+ * functions. The setup work is all done in the initialize method, and after
+ * that all work is done in response to timeouts or user actions, such as
+ * button clicks.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage javascript
+ */
+var Classifier = (function() {
+    /**
+     * Maximum size of the document candidate pool. This constant is used to
+     * decide when to display, e.g., 50+ instead of just 50.
+     * @var int
+     */
+    var MAX_UNLABELLED_BUFFER_SIZE = 51;
+
+    /**
+     * The maximum number of previously-labeled document records to display.
+     * @var int
+     */
+    var MAX_LABELLED = 20;
+
+    /**
+     * How long to wait before adding another '.' to the end of a loading
+     * message. The advantage of choosing 333 is that the time to display three
+     * periods is roughly one second.
+     * @var int
+     */
+    var LOADING_REDRAW = 333;
+
+    // We return this at the bottom, so this is Classifier's public interface.
+    var self = {};
+
+    /**
+     * Gathers references to all relevant DOM elements, initializes state, and
+     * adds event handlers. Because AJAX requests to the administrative areas
+     * of Yioop must be authenticated, this method expects to be called with
+     * its first authentication token; each request will then yield a new token
+     * good for one more request.
+     *
+     * @param string classLabel label for the classifier being trained
+     * @param string authSession authentication token good for one request
+     * @param int authTime timestamp associated with the auth token
+     */
+    self.initialize = function(classLabel, authSession, authTime)
+    {
+        self.classLabel = classLabel;
+        self.authTime = authTime;
+        self.authSession = authSession;
+
+        self.elt = {
+            'positive_count': elt('positive-count'),
+            'negative_count': elt('negative-count'),
+            'accuracy': elt('accuracy'),
+            'update_accuracy': elt('update-accuracy'),
+            'label_docs_form': elt('label-docs-form'),
+            'label_docs_source': elt('label-docs-source'),
+            'label_docs_type': elt('label-docs-type'),
+            'label_docs_keywords': elt('label-docs-keywords'),
+            'label_docs_status': elt('label-docs-status'),
+            'label_docs_queue': null,
+        };
+
+        self.docCounter = 1;
+        self.documents = {};
+        self.activeDocument = null;
+        self.labelledDocQueue = [];
+        self.lastSource = null;
+        self.lastSourceType = null;
+        self.lastKeywords = null;
+        self.lastStatus = '';
+        self.loadingTimer = null;
+
+        self.elt.label_docs_form.onsubmit = function() {
+            self.requestDocuments();
+            return false;
+        }
+
+        self.elt.update_accuracy.onclick = function() {
+            if (!hasClass('disabled', self.elt.update_accuracy)) {
+                self.requestAccuracyUpdate();
+            }
+            return false;
+        }
+    };
+
+    /**
+     * Event handler called when the user clicks on any of the "In class", "Not
+     * in class", and "Skip" links associated with a document. This method
+     * updates the display and sends a request to the server to inform it of
+     * the user's decision and get the next document to be labeled.
+     *
+     * @param int docid key for the associated document
+     * @param string action 'inclass', 'notinclass', or 'skip'
+     */
+    self.handleAction = function(docid, action)
+    {
+        var doc = self.documents[docid];
+        var label;
+        switch (action)
+        {
+            case 'inclass':
+                label = 1;
+                break;
+            case 'notinclass':
+                label = -1;
+                break;
+            case 'skip':
+                label = 0;
+                break;
+        }
+
+        // Only send a request if something has changed.
+        if (doc.label === undefined || doc.label != label) {
+            self.sendNewLabel(doc, label);
+        }
+
+        // Update the class for benefit of the CSS
+        doc.element.className = 'labelled ' + action;
+        doc.label = label;
+
+        /*
+           If the labelled (or skipped) document was the active document, then
+           push it down on the labeled queue, shifting off the oldest document
+           if the queue is full.
+         */
+        if (doc == self.activeDocument) {
+            self.activeDocument = null;
+            if (self.labelledDocQueue.length == MAX_LABELLED) {
+                var droppedDoc = self.labelledDocQueue.shift();
+                droppedDoc.element.parentNode.removeChild(droppedDoc.element);
+            }
+            self.labelledDocQueue.push(doc);
+        }
+
+        return false;
+    }
+
+    /* PRIVATE INTERFACE */
+
+    /**
+     * Sends a request to load up a new candidate pool based on the selected
+     * index, index action, and optional query. The response behavior differs
+     * according to whether the index action specifies marking all candidates
+     * as positive or negative examples, or manual labeling. In the latter case
+     * the number of candidate documents (up to MAX_UNLABELLED_BUFFER_SIZE) is
+     * displayed, while in the former case the number of documents added to the
+     * pool is displayed.
+     */
+    self.requestDocuments = function()
+    {
+        self.lastSource = self.elt.label_docs_source.value;
+        self.lastSourceType = self.elt.label_docs_type.value;
+        self.lastKeywords = self.elt.label_docs_keywords.value;
+
+        var loading = loadingText(self.elt.label_docs_status,
+            tl['editclassifier_loading']);
+
+        sendRequest({
+            'url': '?c=classifier&a=classify&arg=getdocs',
+            'postdata': {
+                'session': self.authSession,
+                'time': self.authTime,
+                'label': self.classLabel,
+                'index': self.lastSource,
+                'type': self.lastSourceType,
+                'keywords': self.lastKeywords
+            },
+            'onSuccess': function(response) {
+                loading.clear();
+                self.authSession = response.authSession;
+                self.authTime = response.authTime;
+                self.clearActiveDocument();
+                if (response.new_doc) {
+                    self.setActiveDocument(response.new_doc);
+                }
+                if (response.add_count) {
+                    // Only present when mass-labeling.
+                    msg = format(tl['editclassifier_added_examples'],
+                        response.add_count, self.lastSourceType);
+                    self.setStatus(msg);
+                    self.drawStatistics(response);
+                } else {
+                    self.drawDocumentCount(response.num_docs);
+                }
+            },
+            'onFailure': function() {
+                loading.clear();
+                self.setStatus(tl['editclassifier_load_failed']);
+            }
+        });
+    }
+
+    /**
+     * Sends one user-assigned label (or skip) for a document to the
+     * classifier controller on the server, identifying the document by its
+     * client-side id and its key. This method is called by the handleAction
+     * method to actually send the new label. On success, any new active
+     * document is displayed and the statistics and document count are redrawn.
+     *
+     * @param object doc document to send a label for
+     * @param int label user-assigned label
+     */
+    self.sendNewLabel = function(doc, label)
+    {
+        var loading = loadingText(self.elt.label_docs_status,
+            tl['editclassifier_loading']);
+        sendRequest({
+            'url': '?c=classifier&a=classify&arg=addlabel',
+            'postdata': {
+                'session': self.authSession,
+                'time': self.authTime,
+                'label': self.classLabel,
+                'index': self.lastSource,
+                'type': self.lastSourceType,
+                'keywords': self.lastKeywords,
+                'doc_to_label': {
+                    'docid': doc.id,
+                    'key': doc.key,
+                    'label': label
+                }
+            },
+            'onSuccess': function(response) {
+                loading.clear();
+                self.authSession = response.authSession;
+                self.authTime = response.authTime;
+                if (response.new_doc) {
+                    /*
+                       There may still be an active document in the case that
+                       we were re-labelling an old document, but now we want to
+                       replace it.
+                     */
+                    self.clearActiveDocument();
+                    self.setActiveDocument(response.new_doc);
+                }
+                self.drawStatistics(response);
+                self.drawDocumentCount(response.num_docs);
+            },
+            'onFailure': function() {
+                loading.clear();
+                self.setStatus(tl['editclassifier_label_update_failed']);
+            }
+        });
+
+    }
+
+    /**
+     * Sends a request to the server to initiate an accuracy update, and on
+     * response updates the statistics (which includes reporting the current
+     * accuracy estimate, if any). Normally, the accuracy is only estimated
+     * each time a set number of documents have been added to the training set.
+     * The update accuracy functionality lets the user request an update
+     * without having to actually add more documents.
+     */
+    self.requestAccuracyUpdate = function()
+    {
+        var updating = tl['editclassifier_updating'];
+        var loading = loadingText(self.elt.update_accuracy, updating, {
+            'dots': false,
+            'className': 'disabled'
+        });
+        sendRequest({
+            'url': '?c=classifier&a=classify&arg=updateaccuracy',
+            'postdata': {
+                'session': self.authSession,
+                'time': self.authTime,
+                'label': self.classLabel,
+                'index': self.lastSource,
+                'type': self.lastSourceType,
+                'keywords': self.lastKeywords,
+            },
+            'onSuccess': function(response) {
+                self.authSession = response.authSession;
+                self.authTime = response.authTime;
+                self.drawStatistics(response);
+                loading.clear();
+            },
+            'onFailure': function() {
+                loading.clear();
+                self.setStatus(tl['editclassifier_acc_update_failed']);
+            }
+        });
+    }
+
+    /** Builds and displays a new active document record for the document data
+     * received from the server. This method both registers the document data
+     * in internal data structures, and creates the DOM structure to display
+     * the document to the user. If this is the very first document to be
+     * labeled since page load, then the table that holds documents is created
+     * before the new document is inserted into the DOM.
+     *
+     * @param object doc data structure representing the new active document
+     */
+    self.setActiveDocument = function(doc) {
+        doc.id = self.docCounter++;
+        self.documents[doc.id] = doc;
+        self.activeDocument = doc;
+
+        // Create table if it doesn't yet exist.
+        if (!self.elt.label_docs_queue) {
+            var queue = document.createElement('table');
+            queue.id = 'label-docs-queue';
+            self.elt.label_docs_form.parentNode.insertBefore(
+                    queue, self.elt.label_docs_form.nextElementSibling);
+            self.elt.label_docs_queue = queue;
+        }
+
+        var newRow = self.buildDocumentRow(doc);
+        doc.element = newRow;
+
+        var topDoc = self.elt.label_docs_queue.firstChild;
+        if (topDoc) {
+            self.elt.label_docs_queue.insertBefore(newRow, topDoc);
+        } else {
+            self.elt.label_docs_queue.appendChild(newRow);
+        }
+    }
+
+    /**
+     * Removes the active document from the DOM and from the internal set of
+     * documents completely. This is done when abandoning the current candidate
+     * pool for another, and is NOT the same as skipping the active document.
+     */
+    self.clearActiveDocument = function()
+    {
+        if (self.activeDocument) {
+            var topDoc = self.activeDocument.element;
+            self.elt.label_docs_queue.removeChild(topDoc);
+            delete self.documents[self.activeDocument.id];
+        }
+        self.activeDocument = null;
+    }
+
+    /**
+     * Updates the display of the counts of positive and negative examples and
+     * the estimated accuracy.  Each time the server responds to a request, it
+     * passes along the classifier's current counts and accuracy estimate to
+     * keep the client presentation of these statistics in sync.
+     *
+     * @param object response data from the last server request
+     */
+    self.drawStatistics = function(response)
+    {
+        self.elt.positive_count.innerHTML = response.positive;
+        self.elt.negative_count.innerHTML = response.negative;
+        if (response.accuracy === null) {
+            self.elt.accuracy.innerHTML = tl['editclassifier_na'];
+        } else {
+            self.elt.accuracy.innerHTML = format('{1}%',
+                (response.accuracy * 100).toFixed(1));
+        }
+    }
+
+    /**
+     * Updates the display of the number of documents currently in the
+     * candidate pool. Since candidates are being iterated over on the server
+     * rather than loaded in all at once, it is unknown exactly how many there
+     * are until the pool has been exhausted. To reflect this situation, when
+     * there are more candidates than will fit in the current pool, a plus sign
+     * is appended to the current count.
+     *
+     * @param int num_docs number of documents in the server's candidate pool
+     */
+    self.drawDocumentCount = function(num_docs)
+    {
+        var msg;
+        if (!num_docs) {
+            msg = tl['editclassifier_no_docs'];
+        } else {
+            var count, plus;
+            if (num_docs == MAX_UNLABELLED_BUFFER_SIZE) {
+                count = MAX_UNLABELLED_BUFFER_SIZE - 1;
+                plus = '+';
+            } else {
+                count = num_docs;
+                plus = '';
+            }
+            msg = format(tl['editclassifier_num_docs'], count, plus);
+        }
+        self.setStatus(msg);
+    }
+
+    /**
+     * A shortcut for setting the HTML of the element that displays document
+     * counts.
+     */
+    self.setStatus = function(msg)
+    {
+        self.elt.label_docs_status.innerHTML = msg;
+    }
+
+    /**
+     * Builds the DOM element representing a document. Each document is
+     * represented by a row in a table, where the row has two cells, the first
+     * dedicated to action links (e.g., for marking a document as a positive
+     * example) and the second to summarizing the document.
+     *
+     * @param object doc data structure representing the new document
+     * @return object table row DOM element representing the document
+     */
+    self.buildDocumentRow = function(doc)
+    {
+        var tr = document.createElement('tr');
+        tr.id = 'doc-' + doc.id;
+        tr.innerHTML =
+            tags('td', {'class': 'actions'},
+                self.buildActionLinkHTML(tl['editclassifier_in_class'],
+                    'inclass', doc),
+                self.buildActionLinkHTML(tl['editclassifier_not_in_class'],
+                    'notinclass', doc),
+                self.buildActionLinkHTML(tl['editclassifier_skip'],
+                    'skip', doc)
+            ) +
+            tags('td', {'class': 'info'},
+                tags('p', {'class': 'page-link'},
+                    tags('a', {'href': doc.cache_link}, doc.title)),
+                tags('p', {'class': 'echo-link'}, doc.url),
+                tags('p', {'class': 'prediction'},
+                    self.buildPredictionHTML(doc)),
+                doc.description && doc.description.length > 0 ?
+                    tags('p', {'class': 'description'}, doc.description) :
+                    ''
+            );
+        return tr;
+    }
+
+    /**
+     * Builds the HTML for an anchor used to allow a user to mark a document as
+     * a positive or negative example, or to skip it. The anchor has an onclick
+     * attribute that calls the handleAction method with the document id and
+     * action.
+     *
+     * @param string label anchor text displayed to the user
+     * @param string action action associated with this anchor
+     * @param object doc data structure representing the document the action
+     *  should be applied to
+     * @return string HTML for a paragraph wrapping the created anchor
+     */
+    self.buildActionLinkHTML = function(label, action, doc)
+    {
+        var onclick = 'return Classifier.handleAction(' + doc.id +
+            ",'" + action + "')";
+        var link = tags('a', {
+            'class': action,
+            'href': '#' + action,
+            'onclick': onclick
+        }, label);
+        return tags('p', {}, '[', link, ']');
+    }
+
+    /**
+     * Builds an HTML string that displays the classification confidence and
+     * disagreement score associated with a document, using data sent from the
+     * server.
+     *
+     * @param object doc data structure representing the document
+     * @return string HTML string to be used to display confidence and
+     *  disagreement
+     */
+    self.buildPredictionHTML = function(doc)
+    {
+        label = (doc.positive ? '' : 'not ') + self.classLabel;
+        var prediction = format(tl['editclassifier_prediction'], label);
+        var scores = format(tl['editclassifier_scores'],
+            (doc.confidence * 100).toFixed(1),
+            (doc.disagreement * 100).toFixed(1));
+        return format('<b>{1}</b> ({2})', prediction, scores);
+    }
+
+    /* UTILITY FUNCTIONS */
+
+    /**
+     * Builds a string containing a pair of HTML tags with optional attributes
+     * and nested elements. All arguments but the tag name are optional, but if
+     * nested elements are to be supplied, then attributes for the opening tag
+     * must be supplied as well, even if they're empty. Attributes are
+     * specified as an object where the keys are attribute names and their
+     * values are strings. Each nested element may be either an HTML string or
+     * an array of HTML strings, all of which will be concatenated together.
+     * This function creates ONLY closed HTML tags (e.g., <td>...</td>, and not
+     * <img.../>); the tag function should be used to create self-closing HTML
+     * tags.
+     *
+     * @param string tagname opening and closing tag name
+     * @param object attributes optional object for which the keys are
+     *  attribute names, and the values are attribute values (may be empty)
+     * @param string|array nested... optional sequence of HTML strings or
+     *  arrays of HTML strings to be nested within the opening and closing tags
+     * @return string HTML string for the described element
+     */
+    function tags(tagname, attributes /* ... */)
+    {
+        var element = [makeOpenTag(tagname, attributes, '>')];
+        for (var i = 2; i < arguments.length; i++) {
+            var type = typeof(arguments[i]);
+            switch (type)
+            {
+                case 'object':
+                    element = element.concat(arguments[i]);
+                    break;
+                case 'string':
+                    if (arguments[i].length > 0)
+                        element.push(arguments[i]);
+                    break;
+            }
+        }
+        element.push('</' + tagname + '>');
+        return element.join('');
+    }
+
+    /**
+     * This function is just like the tags function, but creates a self-closing
+     * tag (e.g., <img.../>), which by necessity cannot contain nested
+     * elements.
+     *
+     * @param string tagname opening tag name
+     * @param object attributes optional object for which the keys are
+     *  attribute names, and the values are attribute values (may be empty)
+     * @return string HTML string for the described element
+     */
+    function tag(tagname, attributes)
+    {
+        return makeOpenTag(tagname, attributes, ' />');
+    }
+
+    /**
+     * A utility function to construct the opening tag of an HTML element, or a
+     * self-closing tag, along with optional attributes.
+     *
+     * @param string tagname opening tag name
+     * @param object attributes optional object for which the keys are
+     *  attribute names, and the values are attribute values (may be empty)
+     * @return string HTML string for the opening (or self-closing) tag
+     */
+    function makeOpenTag(tagname, attributes, endtag)
+    {
+        var tag = ['<' + tagname];
+        if (attributes) {
+            for (key in attributes) {
+                tag.push(' ' + key + '=' + '"' + attributes[key] + '"');
+            }
+        }
+        tag.push(endtag);
+        return tag.join('');
+    }
+
+    /**
+     * A simple string formatter that substitutes string arguments into a
+     * template string. The template string should contain substrings with the
+     * pattern '{\d+}' (e.g., {1}, {2}, ...), which will be replaced with the
+     * corresponding arguments passed to the format function. For example, any
+     * occurrence of '{1}' will be replaced by the first argument after the
+     * template string.
+     *
+     * @param string template template string that optionally contains sentinel
+     *  sequences of the form '{\d+}' to be replaced
+     * @param string arg... positional arguments to be substituted into the
+     *  template string
+     * @return string the template string with each sentinel pattern replaced
+     *  by the appropriate argument
+     */
+    function format(template /* ... */)
+    {
+        var args = arguments;
+        return template.replace(/\{(\d+)\}/g, function(match, i) {
+            var arg = args[parseInt(i)];
+            return typeof arg == 'object' ? JSON.stringify(arg) : arg;
+        });
+    }
+
+    /**
+     * Builds an XmlHttpRequest with optional POST data to be sent to the
+     * server, and calls the appropriate continuation function when the request
+     * completes or fails. The request is carried out asynchronously, and the
+     * response handlers are defined by the onSuccess and onFailure keys of the
+     * options object passed into this function. If the response content-type
+     * is set to application/json, then the response is JSON-decoded before
+     * being passed to the onSuccess handler. The options object supports the
+     * following keys:
+     *
+     *     string url: URL to send the request to (required)
+     *
+     *     string method: HTTP method to use (default GET, but changes to POST
+     *         if postdata is specified without also setting the method)
+     *
+     *     object postdata: object containing key/value pairs of POST arguments
+     *         to be sent with the request; the values are automatically
+     *         URI-encoded (optional)
+     *
+     *     function onSuccess: function to be called upon the completion of a
+     *         successful request; the response body is passed as the first and
+     *         only argument, JSON-decoded if the response content-type was
+     *         application/json (optional)
+     *
+     *     function onFailure: function called if the request times out or
+     *         otherwise can't be completed (optional)
+     *
+     * Example:
+     *
+     *     sendRequest({
+     *         'url': '?c=classifier&a=classify&arg=getdocs',
+     *         'postdata': {
+     *             'time': self.authTime,
+     *             'session': self.authSession,
+     *             'label': self.classLabel,
+     *             'mix': label_docs_source.value,
+     *             'keywords': label_docs_keywords.value
+     *         },
+     *         'onSuccess': function(response) {
+     *             ...
+     *         },
+     *         'onFailure': function() {
+     *             ...
+     *         }
+     *     });
+     *
+     * @param object options request options.
+     */
+    function sendRequest(options)
+    {
+        if (!options.url) {
+            throw "sendRequest: 'url' option is required"
+        }
+
+        var method = options.method || 'GET';
+        var onSuccess = options.onSuccess || function() {};
+        var onFailure = options.onFailure || function() {};
+
+        var request = makeRequest();
+        if (!request) {
+            onFailure();
+            return false;
+        }
+
+        request.onreadystatechange = function() {
+            if (request.readyState == 4 && request.status == 200) {
+                var response = request.responseText;
+                var type = request.getResponseHeader('content-type');
+                if (type.match(/application\/json/)) {
+                    response = JSON.parse(response);
+                }
+                onSuccess(response);
+            }
+        }
+
+        if (options.postdata) {
+            var postdata = buildQueryString(options.postdata);
+            if (!options.method) {
+                method = 'POST';
+            }
+        }
+
+        request.open(method, options.url, true);
+
+        if (postdata) {
+            request.setRequestHeader("Content-type",
+                "application/x-www-form-urlencoded");
+            request.send(postdata);
+        } else {
+            request.send();
+        }
+    }
+
+    /**
+     * Recursively builds a query string from an object, URI-encoding any
+     * strings. Nested objects are handled using the standard HTTP notation for
+     * nested arrays; for example, the element accessed in object notation by
+     * a.b.c would be converted to a[b][c] in the query string.
+     *
+     * @param object obj optionally-nested object to be converted to a query
+     *  string
+     * @param string prefix optional prefix to prepend to keys in obj (used in
+     *  recursive calls)
+     * @return string query string representation of obj
+     */
+    function buildQueryString(obj, prefix)
+    {
+        var str = [];
+        for (var p in obj) {
+            p = encodeURIComponent(p);
+            var k = prefix ? prefix + "[" + p + "]" : p;
+            v = obj[p];
+            str.push(typeof v == "object" ?
+                    buildQueryString(v, k) :
+                    encodeURIComponent(k) + "=" + encodeURIComponent(v));
+        }
+        return str.join("&");
+    }
+
+    /**
+     * Removes a particular class from the passed-in element if it's present;
+     * otherwise does nothing.
+     *
+     * @param string className class name to remove
+     * @param object el DOM object to modify
+     */
+    function removeClass(className, el)
+    {
+        var re = RegExp('(^| )'+className+'( |$)');
+        el.className = el.className.replace(re, '$1');
+    }
+
+    /**
+     * Adds a particular class to the passed-in element; if the element already
+     * has the class then it is deleted and then re-added, which should have no
+     * significant effect.
+     *
+     * @param string className class name to add
+     * @param object el DOM object to modify
+     */
+    function addClass(className, el)
+    {
+        removeClass(className, el);
+        el.className += ' ' + className;
+    }
+
+    /**
+     * Returns true if the passed in element has a particular class, and false
+     * otherwise.
+     *
+     * @param string className the class to check for
+     * @param object el DOM object to query
+     * @return bool true if el has class className, and false otherwise
+     */
+    function hasClass(className, el)
+    {
+        var re = RegExp('(^| )'+className+'( |$)');
+        return el.className.search(re) != -1;
+    }
+
+    /**
+     * Places an element into a loading state, optionally adding a class and
+     * setting some text, and provides a method to call in order to cancel the
+     * loading state. The basic use case is to replace some text element with
+     * 'Loading...' text at the beginning of an asynchronous request, then to
+     * revert back to the pre-loading state once the request completes. This
+     * function returns an object with a clear method, which may be called in
+     * order to cancel the loading state. The options object may contain the
+     * following fields:
+     *
+     *     bool dots: whether to automatically append dots to the loading text
+     *         with the passage of a set time interval; the dots start over
+     *         each time they reach three (default true)
+     *
+     *     int dotsInterval: how long to wait before drawing the next dot
+     *         (default 333ms)
+     *
+     *     string className: class name to add to the element when loading
+     *         starts, and to remove when it completes (default none)
+     *
+     * Example:
+     *
+     *     var loading = loadingText(el, 'Loading');
+     *     someAsynchronousAction({
+     *         onComplete: function() {
+     *             loading.clear();
+     *             ...
+     *         }
+     *     });
+     *
+     * @param object el DOM object to be manipulated
+     * @param string text loading text with which to replace el's innerHTML
+     * @param object options loading options
+     * @return object object with a clear method, which can be called in order
+     *  to cancel the loading state, restoring everything to the way it was
+     *  before loading started
+     */
+    function loadingText(el, text, options)
+    {
+        if (options == undefined) {
+            options = {};
+        }
+        var oldHTML = el.innerHTML;
+        var drawDots = options.dots !== false;
+        var interval = options.dotsInterval || 333;
+        var timer;
+        if (drawDots) {
+            timer = window.setInterval(function() {
+                if (el.innerHTML.match(/\.{3}$/)) {
+                    el.innerHTML = text;
+                }  else {
+                    el.innerHTML += '.';
+                }
+            }, interval);
+        }
+        if (options.className) {
+            addClass(options.className, el);
+        }
+        el.innerHTML = text;
+        return obj = {
+            'clear': function() {
+                if (drawDots) {
+                    window.clearInterval(timer);
+                }
+                el.innerHTML = oldHTML;
+                if (options.className) {
+                    removeClass(options.className, el);
+                }
+            }
+        };
+    }
+
+    return self;
+})();
diff --git a/views/admin_view.php b/views/admin_view.php
index 5333b156c..e9f3687c0 100755
--- a/views/admin_view.php
+++ b/views/admin_view.php
@@ -54,8 +54,9 @@ class AdminView extends View
     var $elements = array("language", "activity", "signin",
         "managecrawls", "manageaccount", "manageusers", "manageroles",
         "mixcrawls", "managelocales", "editlocales", "crawloptions",
-        "editmix", "pageoptions", "resultseditor", "searchsources",
-        "managemachines", "machinelog", "editstatic", "configure");
+        "editmix", "pageoptions", "manageclassifiers", "editclassifier",
+        "resultseditor", "searchsources", "managemachines", "machinelog",
+        "editstatic", "configure");
     /** Names of helper objects that the view uses to help draw itself
      *  @var array
      */
diff --git a/views/elements/editclassifier_element.php b/views/elements/editclassifier_element.php
new file mode 100644
index 000000000..431e2cedf
--- /dev/null
+++ b/views/elements/editclassifier_element.php
@@ -0,0 +1,149 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage element
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * This element renders the initial edit page for a classifier, where the user
+ * can update the classifier label and find documents to label and add to the
+ * training set. The page displays some initial statistics and a form for
+ * finding documents in any existing index, but after that it is heavily
+ * modified by JavaScript in response to user actions and XmlHttpRequests
+ * made to the server.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage element
+ */
+class EditclassifierElement extends Element
+{
+    /**
+     * Draws the "edit classifier" element to the output buffers.
+     *
+     * The markup consists of a link back to the classifier list, a form for
+     * renaming the classifier, the classifier's example counts and accuracy
+     * together with an "update" link, and a form for choosing a crawl,
+     * labelling mode, and keywords used to load documents.
+     *
+     * @param array $data used to pass the class label ('class_label'),
+     *  classifier instance ('classifier'), list of existing crawls
+     *  ('CRAWLS', each with 'CRAWL_TIME' and 'DESCRIPTION'), the CSRF token,
+     *  and the 'leftorright' CSS class name
+     */
+    function render($data)
+    {
+        $classifier = $data['classifier'];
+    ?>
+        <div class="current-activity">
+        <div class="<?php e($data['leftorright']);?>">
+        <a href="?c=admin&amp;a=manageClassifiers&amp;<?php
+            e(CSRF_TOKEN.'='.$data[CSRF_TOKEN]) ?>"><?php
+            e(tl('editclassifier_back')) ?></a>
+        </div>
+        <h2><?php e(tl('editclassifier_edit_classifier')) ?></h2>
+        <form id="classifierForm" method="get" action="">
+        <input type="hidden" name="c" value="admin" />
+        <input type="hidden" name="<?php e(CSRF_TOKEN); ?>" value="<?php
+            e($data[CSRF_TOKEN]); ?>" />
+        <input type="hidden" name="a" value="manageClassifiers" />
+        <input type="hidden" name="arg" value="editclassifier" />
+        <input type="hidden" name="update" value="update" />
+        <input type="hidden" name="class_label"
+            value="<?php e($data['class_label']) ?>" />
+        <div class="top-margin">
+        <label for="rename-label"><?php
+            e(tl('editclassifier_classifier_label')) ?></label>
+            <input type="text" id="rename-label" name="rename_label"
+                value="<?php e($data['class_label']) ?>"
+                maxlength="80" class="wide-field"/>
+            <button class="button-box" type="submit"><?php
+                e(tl('editclassifier_change')); ?></button>
+        </div>
+        </form>
+        <h3><?php e(tl('editclassifier_statistics')) ?></h3>
+        <p><b><?php e(tl('editclassifier_positive_examples'))
+            ?></b> <span id="positive-count"><?php
+            e($classifier->positive) ?></span></p>
+        <p><b><?php e(tl('editclassifier_negative_examples'))
+            ?></b> <span id="negative-count"><?php
+            e($classifier->negative) ?></span></p>
+        <p><b><?php e(tl('editclassifier_accuracy'))
+            ?></b> <span id="accuracy"><?php
+            // A null accuracy is shown as the localized "n/a" text;
+            // otherwise it is printed as a percentage with one decimal.
+            if (!is_null($classifier->accuracy)) {
+                printf('%.1f%%', $classifier->accuracy * 100);
+            } else {
+                e(tl('editclassifier_na'));
+            }?></span>
+            [<a id="update-accuracy" href="#update-accuracy"
+            <?php
+            // The update link is marked disabled while the classifier's
+            // total is below 10.
+            if ($classifier->total < 10) {
+                e('class="disabled"');
+            } ?>><?php e(tl('editclassifier_update')) ?></a>]</p>
+        <h3><?php e(tl('editclassifier_add_examples')) ?></h3>
+        <form id="label-docs-form" action="" method="GET">
+        <table>
+            <tr>
+            <th><?php e(tl('editclassifier_source')) ?></th>
+            <td>
+                <select id="label-docs-source" name="label_docs_source">
+                    <option value="1" selected="selected"><?php
+                        e(tl('editclassifier_default_crawl')) ?></option>
+                <?php foreach ($data['CRAWLS'] as $crawl) { ?>
+                    <option value="<?php e($crawl['CRAWL_TIME']) ?>"><?php
+                        e($crawl['DESCRIPTION']) ?></option>
+                <?php } ?>
+                </select>
+            </td>
+            <td>
+                <select id="label-docs-type" name="label_docs_type">
+                    <option value="manual" selected="selected"><?php
+                        e(tl('editclassifier_label_by_hand')) ?></option>
+                    <option value="positive"><?php
+                        e(tl('editclassifier_all_in_class')) ?></option>
+                    <option value="negative"><?php
+                        e(tl('editclassifier_none_in_class')) ?></option>
+                </select>
+            </td>
+            </tr>
+            <tr>
+                <th><?php e(tl('editclassifier_keywords')) ?></th>
+                <td colspan="2">
+                    <input type="text" maxlength="80" id="label-docs-keywords"
+                        name="label_docs_keywords" />
+                    <button class="button-box" type="submit"><?php
+                        e(tl('editclassifier_load')) ?></button>
+                </td>
+            </tr>
+            <tr>
+                <th>&nbsp;</th>
+                <td id="label-docs-status" colspan="2"><?php
+                    e(tl('editclassifier_no_documents')) ?></td>
+            </tr>
+        </table>
+        </form>
+        </div>
+    <?php
+    }
+}
+?>
diff --git a/views/elements/manageclassifiers_element.php b/views/elements/manageclassifiers_element.php
new file mode 100644
index 000000000..1c6575fde
--- /dev/null
+++ b/views/elements/manageclassifiers_element.php
@@ -0,0 +1,125 @@
+<?php
+/**
+ *  SeekQuarry/Yioop --
+ *  Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
+ *
+ *  LICENSE:
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  END LICENSE
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @package seek_quarry
+ * @subpackage element
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2013
+ * @filesource
+ */
+
+if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
+
+/**
+ * This element renders the page that lists classifiers, provides a form to
+ * create new ones, and provides per-classifier action links to edit, finalize,
+ * and delete the associated classifier.
+ *
+ * @author Shawn Tice
+ * @package seek_quarry
+ * @subpackage element
+ */
+class ManageclassifiersElement extends Element
+{
+    /**
+     * Draws the "new classifier" form and table of existing classifiers
+     *
+     * Each table row shows the classifier's label, timestamp, and positive
+     * and negative counts, followed by edit, finalize, and delete actions.
+     * The finalize cell depends on the classifier's finalized state: a
+     * link (or plain text when the classifier's total is zero) if
+     * unfinalized, and status text if finalizing or finalized.
+     *
+     * @param array $data used to pass the list of existing classifier
+     *  instances ('classifiers', keyed by class label) and the CSRF token
+     */
+    function render($data)
+    {
+        $base_url = "?c=admin&amp;a=manageClassifiers&amp;".CSRF_TOKEN."=".
+            $data[CSRF_TOKEN]."&amp;arg=";
+        ?>
+        <div class="current-activity">
+        <h2><?php e(tl('manageclassifiers_manage_classifiers')) ?></h2>
+        <form id="classifiersForm" method="get" action=''>
+        <input type="hidden" name="c" value="admin" />
+        <input type="hidden" name="<?php e(CSRF_TOKEN); ?>" value="<?php
+            e($data[CSRF_TOKEN]); ?>" />
+        <input type="hidden" name="a" value="manageClassifiers" />
+        <input type="hidden" name="arg" value="createclassifier" />
+        <div class="top-margin"><label for="class-label"><?php
+            e(tl('manageclassifiers_classifier_name')) ?></label>
+            <input type="text" id="class-label" name="class_label"
+                value="" maxlength="80"
+                    class="wide-field"/>
+            <button class="button-box"  type="submit"><?php
+                e(tl('manageclassifiers_create_button')) ?></button>
+        </div>
+        </form>
+        <?php if (!empty($data['classifiers'])) { ?>
+        <h3><?php e(tl('manageclassifiers_available_classifiers')) ?></h3>
+        <table class="classifiers-table">
+            <tr>
+                <th><?php e(tl('manageclassifiers_label_col')) ?></th>
+                <th><?php e(tl('manageclassifiers_positive_col')) ?></th>
+                <th><?php e(tl('manageclassifiers_negative_col')) ?></th>
+                <th colspan="3"><?php
+                    e(tl('manageclassifiers_actions_col')) ?></th>
+            </tr>
+        <?php foreach ($data['classifiers'] as $label => $classifier) {
+            // The label is placed in query strings below, so encode it once
+            // here; characters such as '&', '#', or spaces would otherwise
+            // corrupt the action links.
+            $url_label = urlencode($label); ?>
+            <tr>
+                <td><b><?php e($label) ?></b><br />
+                    <small><?php e(date("d M Y H:i:s",
+                        $classifier->timestamp)) ?></small>
+                </td>
+                <td><?php e($classifier->positive) ?></td>
+                <td><?php e($classifier->negative) ?></td>
+                <td><a href="<?php e($base_url)
+                    ?>editclassifier&amp;class_label=<?php
+                    e($url_label) ?>"><?php
+                        e(tl('manageclassifiers_edit')) ?></a></td>
+                <td><?php
+                if ($classifier->finalized == Classifier::FINALIZED) {
+                    e(tl('manageclassifiers_finalized'));
+                } else if ($classifier->finalized == Classifier::UNFINALIZED) {
+                    // Finalizing is only offered as a link when the
+                    // classifier's total is non-zero; otherwise the text is
+                    // shown without a link.
+                    if ($classifier->total > 0) {
+                        ?><a href="<?php e($base_url)
+                        ?>finalizeclassifier&amp;class_label=<?php
+                        e($url_label) ?>"><?php
+                            e(tl('manageclassifiers_finalize')) ?></a><?php
+                    } else {
+                        e(tl('manageclassifiers_finalize'));
+                    }
+                } else if ($classifier->finalized == Classifier::FINALIZING) {
+                    e(tl('manageclassifiers_finalizing'));
+                }
+                ?></td>
+                <td><a href="<?php e($base_url)
+                    ?>deleteclassifier&amp;class_label=<?php
+                    e($url_label) ?>"><?php
+                        e(tl('manageclassifiers_delete')) ?></a></td>
+            </tr>
+        <?php } // end foreach over classifiers ?>
+        </table>
+        <?php } // endif for available classifiers ?>
+        </div>
+    <?php
+    }
+}
+?>
diff --git a/views/elements/pageoptions_element.php b/views/elements/pageoptions_element.php
index 1288a7ce2..ae5101eaf 100644
--- a/views/elements/pageoptions_element.php
+++ b/views/elements/pageoptions_element.php
@@ -122,10 +122,12 @@ class PageOptionsElement extends Element
                     ?><td><table class="file-types-table" ><?php
                  }
        ?>
-            <tr><td><label for="<?php e($filetype); ?>-id"><?php
+            <tr><td><label for="filetype-<?php e($filetype); ?>-id"><?php
                 e($filetype); ?>
             </label></td><td><input type="checkbox" <?php e($checked) ?>
-                name="filetype[<?php  e($filetype); ?>]" value="true" /></td>
+                name="filetype[<?php  e($filetype); ?>]"
+                id="filetype-<?php  e($filetype); ?>-id"
+                value="true" /></td>
             </tr>
        <?php
                 $cnt++;
@@ -139,9 +141,47 @@ class PageOptionsElement extends Element
             }
         ?>
         </tr></table>
-            <div class="top-margin"><b><?php
-                e(tl("pageoptions_element_indexing_plugins"));?></b></div>
-        <?php if(isset($data['INDEXING_PLUGINS']) &&
+        <div class="top-margin"><b><?php
+            e(tl('pageoptions_element_classifiers_to_apply')) ?></b>
+       </div>
+       <?php if (!empty($data['CLASSIFIERS'])) { ?>
+           <table class="classifiers-all"><tr>
+           <?php $cnt = 0;
+                 $num_per_column = count($data['CLASSIFIERS']);
+                 if ($num_per_column > 5) {
+                     $num_per_column = ceil($num_per_column / 3);
+                 }
+                 foreach ($data['CLASSIFIERS'] as $label => $checked) {
+                     if ($cnt % $num_per_column == 0) {
+                        ?><td><table class="classifiers-table" ><?php
+                     }
+           ?>
+                <tr><td><label for="classifier-<?php e($label); ?>-id"><?php
+                    e($label); ?>
+                </label></td><td><input type="checkbox" <?php e($checked) ?>
+                    name="classifier[<?php  e($label); ?>]"
+                    id="classifier-<?php e($label) ?>-id" value="true" /></td>
+                </tr>
+           <?php
+                    $cnt++;
+                    if($cnt % $num_per_column == 0) {
+                        ?></table></td><?php
+                    }
+                }?>
+            <?php
+                if($cnt % $num_per_column != 0) {
+                    ?></table></td><?php
+                }
+            ?>
+            </tr></table>
+        <?php
+        } else {
+            e("<p class='red'>".
+                tl('pageoptions_element_no_classifiers').'</p>');
+        } ?>
+        <div class="top-margin"><b><?php
+            e(tl("pageoptions_element_indexing_plugins"));?></b></div>
+        <?php if(isset($data['INDEXING_PLUGINS']) &&
             count($data['INDEXING_PLUGINS']) > 0) { ?>
             <table class="indexing-plugin-table">
                 <tr><th><?php e(tl('pageoptions_element_plugin'));
ViewGit