Last commit for src/library/classifiers/ChiSquaredFeatureSelection.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library\classifiers;

/**
 * A subclass of FeatureSelection that implements chi-squared feature
 * selection.
 *
 * This feature selection method scores each feature according to its
 * informativeness, then selects the top N most informative features, where N
 * is a run-time parameter.
 *
 * @author Shawn Tice
 */
class ChiSquaredFeatureSelection extends FeatureSelection
{
    /**
     * The maximum number of features to select, a runtime parameter.
     * When left unset (null) the limit defaults to the total number of
     * features; a value of zero or less selects no features.
     * @var int
     */
    public $max;
    /**
     * Uses the chi-squared feature selection algorithm to rank features by
     * informativeness, and return a map from old feature indices to new ones.
     *
     * Each feature index from 1 to numFeatures() is scored once per label
     * (-1 and 1) and the larger of the two scores is kept as the feature's
     * informativeness. A min-heap of at most min($this->max, numFeatures())
     * [score, index] pairs retains the best scoring features seen so far;
     * that heap is then handed to buildMap() to produce the returned map.
     *
     * @param Features $features full feature set
     * @return array associative array mapping a subset of the original feature
     * indices to new indices
     */
    public function select(Features $features)
    {
        $n = $features->numFeatures();
        $selected = new \SplMinHeap();
        $allowed = isset($this->max) ? min($this->max, $n) : $n;
        /*
           With no slots to fill the heap would stay empty, and peeking at an
           empty SplMinHeap throws a RuntimeException. Nothing can be selected
           in that case, so build the map from the empty heap right away.
         */
        if ($allowed <= 0) {
            return $this->buildMap($selected);
        }
        $labels = [-1, 1];
        /*
           Start with 1, since 0 is dedicated to the constant intercept term;
           <= $n because n is the last feature.
         */
        for ($j = 1; $j <= $n; $j++) {
            $max_chi2 = 0.0;
            foreach ($labels as $label) {
                /*
                   t = term present
                   l = document has label
                   n = negation
                 */
                $stats = $features->varStats($j, $label);
                list($t_l, $t_nl, $nt_l, $nt_nl) = $stats;
                $num = ($t_l * $nt_nl) - ($t_nl * $nt_l);
                $den = ($t_l + $t_nl) * ($nt_l + $nt_nl);
                /*
                   NOTE(review): a zero denominator is scored as INF, which
                   ranks such a feature above every other one. Assuming
                   varStats returns raw document counts, this happens when the
                   term is present in none or in all of the documents --
                   confirm that treating these as maximally informative is
                   intended.
                 */
                $chi2 = $den != 0 ? ($num * $num) / $den : INF;
                if ($chi2 > $max_chi2) {
                    $max_chi2 = $chi2;
                }
            }
            /*
               Keep track of top features in a heap, as we compute
               informativeness. Heap entries are [score, index] pairs, so the
               top of the min-heap is the weakest feature currently retained.
             */
            if ($allowed > 0) {
                // Still filling the heap: every feature is accepted.
                $selected->insert([$max_chi2, $j]);
                $allowed -= 1;
            } else {
                // Heap is full: replace the weakest entry only if beaten.
                $other_chi2 = $selected->top()[0];
                if ($max_chi2 > $other_chi2) {
                    $selected->extract();
                    $selected->insert([$max_chi2, $j]);
                }
            }
        }
        return $this->buildMap($selected);
    }
}
ViewGit