Fixes to get classifier_tool.php working again, final tweaks to documentation for v0.96, a=chris

Chris Pollett [2013-07-24 06:Jul:th]
Fixes to get classifier_tool.php working again, final tweaks to documentation for v0.96, a=chris
Filename
bin/classifier_tool.php
controllers/controller.php
lib/utility.php
views/view.php
diff --git a/bin/classifier_tool.php b/bin/classifier_tool.php
index 73d1dbe92..51c804f87 100755
--- a/bin/classifier_tool.php
+++ b/bin/classifier_tool.php
@@ -55,6 +55,12 @@ if(!PROFILE) {
  */
 define("NO_CACHE", true);

+/** To use and manipulate classifiers */
+require_once BASE_DIR."/lib/classifiers/classifier.php";
+
+/** To manipulate crawl mixes using the controller's methods */
+require_once BASE_DIR."/controllers/classifier_controller.php";
+
 /**
  * Immediately throw an exception for all notices and warnings, rather than
  * letting execution continue.
@@ -75,10 +81,49 @@ function handleError($errno, $err_str, $err_file, $err_line)
 }
 set_error_handler('handleError');

-/** To use and manipulate classifiers */
-require_once BASE_DIR."/lib/classifiers/classifier.php";
-/** To manipulate crawl mixes using the controller's methods */
-require_once BASE_DIR."/controllers/classifier_controller.php";
+/**
+ * Instructions for how to use classifier tool
+ * @var string
+ */
+$INSTRUCTIONS = <<<EOD
+
+This tool is used to automate the building and testing of classifiers,
+providing an alternative to the web interface when a labeled training set is
+available.
+
+classifier_tool.php takes an activity to perform, the name of a dataset to use,
+and a label for the constructed classifier. The activity is the name of one
+of the 'run*' functions implemented by this class, without the common 'run'
+prefix (e.g., 'TrainAndTest'). The dataset is specified as the common prefix
+of two indexes that have the suffixes "Pos" and "Neg", respectively.  So if
+the prefix were "DATASET", then this tool would look for the two existing
+indexes "DATASET Pos" and "DATASET Neg" from which to draw positive and
+negative examples. Each document in these indexes should be a positive or
+negative example of the target class, according to whether it's in the "Pos"
+or "Neg" index. Finally, the label is just the label to be used for the
+constructed classifier.
+
+Beyond these options (set with the -a, -d, and -l flags), a number of other
+options may be set to alter parameters used by an activity or a classifier.
+These options are set using the -S, -I, -F, and -B flags, which correspond
+to string, integer, float, and boolean parameters respectively. These flags
+may be used repeatedly, and each expects an argument of the form NAME=VALUE,
+where NAME is the name of a parameter, and VALUE is a value parsed according
+to the flag. The NAME should match one of the keys of the options member of
+this class, where a period ('.') may be used to specify nesting.  For
+example:
+
+    -I debug=1         # set the debug level to 1
+    -B cls.use_nb=0    # tell the classifier to use Naive Bayes
+
+To build and evaluate a classifier for the label 'spam', trained using the
+two indexes "DATASET Neg" and "DATASET Pos", and a maximum of the top 25
+most informative features:
+
+php bin/classifier_tool.php -a TrainAndTest -d 'DATASET' -l 'spam'
+    -I cls.chi2.max=25
+
+EOD;

 /*
  *  We'll set up multi-byte string handling to use UTF-8
@@ -87,42 +132,11 @@ mb_internal_encoding("UTF-8");
 mb_regex_encoding("UTF-8");

 /**
- * This class is used to automate the building and testing of classifiers,
- * providing an alternative to the web interface when a labeled training set is
- * available.
- *
- * ClassifierTool takes an activity to perform, the name of a dataset to use,
- * and a label for the constructed classifier. The activity is the name of one
- * of the 'run*' functions implemented by this class, without the common 'run'
- * prefix (e.g., 'TrainAndTest'). The dataset is specified as the common prefix
- * of two indexes that have the suffixes "Pos" and "Neg", respectively.  So if
- * the prefix were "DATASET", then this tool would look for the two existing
- * indexes "DATASET Pos" and "DATASET Neg" from which to draw positive and
- * negative examples. Each document in these indexes should be a positive or
- * negative example of the target class, according to whether it's in the "Pos"
- * or "Neg" index. Finally, the label is just the label to be used for the
- * constructed classifier.
- *
- * Beyond these options (set with the -a, -d, and -l flags), a number of other
- * options may be set to alter parameters used by an activity or a classifier.
- * These options are set using the -S, -I, -F, and -B flags, which correspond
- * to string, integer, float, and boolean parameters respectively. These flags
- * may be used repeatedly, and each expects an argument of the form NAME=VALUE,
- * where NAME is the name of a parameter, and VALUE is a value parsed according
- * to the flag. The NAME should match one of the keys of the options member of
- * this class, where a period ('.') may be used to specify nesting.  For
- * example:
- *
- *    -I debug=1         # set the debug level to 1
- *    -B cls.use_nb=0    # tell the classifier to use Naive Bayes
- *
- * To build and evaluate a classifier for the label 'spam', trained using the
- * two indexes "DATASET Neg" and "DATASET Pos", and a maximum of the top 25
- * most informative features:
- *
- * php bin/classifier_tool.php -a TrainAndTest -d 'DATASET' -l 'spam'
- *    -I cls.chi2.max=25
+ * Class used to encapsulate all the activities of the classifier_tool.php
+ * command line script. This script allows one to automate the building and
+ * testing of classifiers, providing an alternative to the web interface when
  *
+ * a labeled training set is available.
  * @author Shawn Tice
  * @package seek_quarry
  */
@@ -258,12 +272,17 @@ class ClassifierTool
      */
     function main()
     {
+        global $argv, $INSTRUCTIONS;
+        if(count($argv) < 2) {
+            echo $INSTRUCTIONS;
+            exit(1);
+        }
         list($activity, $dataset_name, $label) = $this->parseOptions();
         $method = "run{$activity}";
         if (method_exists($this, $method)) {
             $this->$method($label, $dataset_name);
         } else {
-            echo "no activity: {$activity}\n";
+            echo "no activity: {$activity}\n\n";
             exit(1);
         }
     }
@@ -736,4 +755,4 @@ try {
 } catch (ErrorException $e) {
     echo $e . "\n";
 }
-?>
\ No newline at end of file
+?>
diff --git a/controllers/controller.php b/controllers/controller.php
index 958b25c64..b6763c7ea 100755
--- a/controllers/controller.php
+++ b/controllers/controller.php
@@ -83,10 +83,9 @@ abstract class Controller

             $this->$model_instance_name = new $model_name();
         }
-
         require_once BASE_DIR."/views/view.php";
-
         foreach($this->views as $view) {
+
             if(file_exists(APP_DIR."/views/".$view."_view.php")) {
                 require_once APP_DIR."/views/".$view."_view.php";
             } else {
@@ -97,6 +96,7 @@ abstract class Controller

             $this->$view_instance_name = new $view_name();
         }
+
         $this->indexing_plugins = $indexing_plugins;
         foreach($this->indexing_plugins as $plugin) {
             if(file_exists(APP_DIR.
@@ -111,7 +111,6 @@ abstract class Controller
             $plugin_instance_name = lcfirst($plugin_name);
             $this->$plugin_instance_name = new $plugin_name();
         }
-
     }

     /**
diff --git a/lib/utility.php b/lib/utility.php
index 7e7a7bcef..6f019b365 100755
--- a/lib/utility.php
+++ b/lib/utility.php
@@ -706,7 +706,16 @@ function crawlHash($string, $raw = false)
 }

 /**
- *
+ *  Used to create a 20 byte hash of a string (typically a word)
+ *  together with a string of meta data about the page that the word
+ *  appeared on (media:, safe:, class:) information
+ *
+ *  @param string $string word to hash
+ *  @param bool $raw whether to base64Hash the result
+ *  @param $meta_string the up to 11 byte string of meta information
+ *  @return string first 8 bytes of md5 of $string concatenated with \x00
+ *      to indicate the hash is of a word not a phrase concatenated  with the
+ *      padded to 11 byte $meta_string.
  */
 function crawlHashWord($string, $raw = false, $meta_string = "")
 {
diff --git a/views/view.php b/views/view.php
index 5b7c57b4f..b57b5d428 100755
--- a/views/view.php
+++ b/views/view.php
@@ -33,7 +33,9 @@

 if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

-$locale_version = tl('view_locale_version3');
+if(php_sapi_name() != 'cli') {
+    $locale_version = tl('view_locale_version3');
+}
 /**
  * Base View Class. A View is used to display
  * the output of controller activity
ViewGit