Last commit for models/parallel_model.php: 7b8be48aa40bd0881b36512aeb5e528f5bfbf2cb

Add Classification to Yioop a=shawn

Shawn Tice [2013-05-09 18:19:48]
Add Classification to Yioop a=shawn

This commit adds a new set of library files, major modifications to the admin
controller, a new classifiers controller, two new bin tools for training a
classifier via the web interface and testing classifiers, a new activity tab,
and localizations.

Signed-off-by: Chris Pollett <chris@pollett.org>
<?php
/**
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009 - 2012  Chris Pollett chris@pollett.org
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage model
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2012
 * @filesource
 */

// Bail out immediately unless the BASE_DIR constant has already been defined.
if(!defined('BASE_DIR')) {
    echo "BAD REQUEST";
    exit();
}


/**
 * Loads common constants for web crawling, used for index_data_base_name and
 * schedule_data_base_name
 */
require_once BASE_DIR."/lib/crawl_constants.php";
/**
 * Crawl data is stored in IndexArchiveBundles, which are managed by the
 * IndexManager, so load the definition of this class
 */
require_once BASE_DIR."/lib/index_manager.php";
/** For crawlHash function */
require_once BASE_DIR."/lib/utility.php";
/**
 * Needed for getHost
 */
require_once BASE_DIR.'/lib/url_parser.php';
/**
 * Needed to be able to send data via http to remote queue_servers
 */
require_once BASE_DIR.'/lib/fetch_url.php';

/**
 * Base class of models that need access to data from multiple queue servers
 *
 * @author Chris Pollett
 *
 * @package seek_quarry
 * @subpackage model
 */
class ParallelModel extends Model implements CrawlConstants
{
    /**
     * Stores the name of the current index archive to use to get search
     * results from
     * @var string
     */
    var $index_name;
    /**
     * If known the id of the queue_server this belongs to
     * @var int
     */
    var $current_machine;
    /**
     * the minimum length of a description before we stop appending
     * additional link doc summaries
     */
    const MIN_DESCRIPTION_LENGTH = 100;
    /**
     *  {@inheritdoc}
     */
    function __construct($db_name = DB_NAME)
    {
        parent::__construct($db_name);
        $this->current_machine = 0;//if known, controller will set later
    }

    /**
     * Gets the summary of a single document identified by its url.
     *
     * This is a convenience wrapper around getCrawlItems() that builds a
     * one-element lookup keyed by the hash of the url.
     *
     * @param string $url of summary we are trying to look-up
     * @param array $machine_urls an array of urls of yioop queue servers
     * @param string $index_name timestamp of the index to do the lookup in;
     *      if empty, $this->index_name is used
     * @return array summary data of the matching document; if no summary
     *      was found for the url, the (empty) result array of
     *      getCrawlItems() is returned instead
     */
    function getCrawlItem($url, $machine_urls = NULL, $index_name = "")
    {
        $hash_url = crawlHash($url, true);
        if($index_name == "") {
            $index_name = $this->index_name;
        }
        $results = $this->getCrawlItems(
            array($hash_url =>array($url, $index_name)), $machine_urls);
        if(isset($results[$hash_url])) {
            return $results[$hash_url];
        }
        return $results;
    }

    /**
     * Gets summaries for a set of document by their url, or by group of
     * 5-tuples of the form (machine, key, index, generation, offset).
     *
     * Each entry of $lookups is keyed by a lookup id and its value is either
     *  - a pair array($url, $index_name), or
     *  - a list of lookup items, where each item is either a pair
     *    array($word_key, $index_name) or a 5-tuple
     *    array($machine, $key, $index_name, $generation, $summary_offset).
     * When a lookup id has several items, the first page found becomes the
     * summary and the descriptions of later pages are appended to it
     * separated by " .. ".
     *
     * If $machine_urls is given and is not just the local machine, the
     * lookups are forwarded to the relevant machines via execMachines() and
     * the returned summaries are merged; otherwise the lookups are answered
     * from the local index.
     *
     * @param array $lookups things whose summaries we are trying to look up
     * @param array $machine_urls an array of urls of yioop queue servers
     * @return array of summary data for the matching documents, keyed by
     *      the same keys as $lookups; lookups with no summary are omitted
     */
    function getCrawlItems($lookups, $machine_urls = NULL)
    {
        $summaries = array();
        if($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
            $num_machines = count($machine_urls);
            $machines = array();
            // work out which machines actually need to be asked
            foreach($lookups as $lookup => $lookup_info) {
                if(count($lookup_info) == 2 && $lookup_info[0][0] === 'h') {
                    list($url, $index_name) = $lookup_info;
                    $index = calculatePartition($url, $num_machines,
                        "UrlParser::getHost");
                    $machines[$index] = $machine_urls[$index];
                } else {
                    foreach($lookup_info as $lookup_item) {
                        if(count($lookup_item) == 5) {
                            list($index, , , , ) = $lookup_item;
                            $machines[$index] = $machine_urls[$index];
                        } else {
                            // machine not known for this item, so ask all
                            $machines = $machine_urls;
                            break;
                        }
                    }
                }

            }
            $page_set = $this->execMachines("getCrawlItems",
                $machines, serialize($lookups), $num_machines);

            if(is_array($page_set)) {
                foreach($page_set as $elt) {
                    // a machine that failed to respond may have no page data
                    if(!isset($elt[self::PAGE])) {continue;}
                    /* NOTE(review): unserialize() is applied to data
                       returned over HTTP by the remote machines; this is
                       only safe if those machines are trusted.
                     */
                    $result = unserialize(webdecode($elt[self::PAGE]));
                    if(!is_array($result)) continue;
                    foreach($result as $lookup => $summary) {
                        if(isset($summaries[$lookup])) {
                            if(isset($summary[self::DESCRIPTION])) {
                                if(!isset($summaries[$lookup][
                                    self::DESCRIPTION])){
                                    $summaries[$lookup][self::DESCRIPTION] = "";
                                }
                                /* append to (rather than overwrite) what
                                   earlier machines already contributed
                                 */
                                $summaries[$lookup][self::DESCRIPTION] .=
                                    " .. ".$summary[self::DESCRIPTION];
                            }
                            // only fill in attributes not already present
                            foreach($summary as $attr => $value){
                                if($attr !=self::DESCRIPTION &&
                                    !isset($summaries[$lookup][$attr])) {
                                    $summaries[$lookup][$attr] = $value;
                                }
                            }
                        } else {
                            $summaries[$lookup] =  $summary;
                        }
                    }
                }
            }
            return $summaries;
        }
        foreach($lookups as $lookup => $lookup_info) {
            if(count($lookup_info) == 2 && $lookup_info[0][0] === 'h') {
                list($url, $index_name) = $lookup_info;
                $index_archive = IndexManager::getIndex($index_name);
                $offset_info =
                    $this->lookupSummaryOffsetGeneration($url, $index_name);
                if(!is_array($offset_info)) {
                    // url not found in this index, nothing to summarize
                    continue;
                }
                list($summary_offset, $generation) = $offset_info;
                $summary =
                    $index_archive->getPage($summary_offset, $generation);
            } else {
                $summary = array();
                foreach($lookup_info as $lookup_item) {
                    if(count($lookup_item) == 2) {
                        list($word_key, $index_name) = $lookup_item;
                        $offset_info =
                            $this->lookupSummaryOffsetGeneration(
                                $word_key, $index_name, true);
                        if(is_array($offset_info)) {
                            list($summary_offset, $generation) = $offset_info;
                        } else {
                            continue;
                        }
                    } else {
                        list($machine, $key, $index_name, $generation,
                            $summary_offset) = $lookup_item;
                    }
                    $index = IndexManager::getIndex($index_name);
                    $index->setCurrentShard($generation, true);
                    $page = @$index->getPage($summary_offset);
                    if(!$page || $page == array()) {continue;}
                    $ellipsis_used = false;
                    $copy = false;
                    if($summary == array()) {
                        $summary = $page;
                    } else if (isset($page[self::DESCRIPTION])) {
                        if(!isset($summary[self::DESCRIPTION])) {
                            $summary[
                                self::DESCRIPTION] = "";
                        }
                        $summary[self::DESCRIPTION].=
                            " .. ".$page[self::DESCRIPTION];
                        $ellipsis_used = true;
                        $copy = true;
                    } else {
                        $copy = true;
                    }
                    if($ellipsis_used && strlen($summary[self::DESCRIPTION]) >
                        self::MIN_DESCRIPTION_LENGTH) {
                        /* want at least one ellipsis in case terms only
                           appear in links
                         */
                        break;
                    }
                    if($copy) {
                        // only fill in attributes not already present
                        foreach($page as $attr => $value){
                            if($attr !=self::DESCRIPTION &&
                                !isset($summary[$attr])) {
                                $summary[$attr] = $value;
                            }
                        }
                    }
                }
            }
            if($summary != array()) {
                $summaries[$lookup] = $summary;
            }
        }

        return $summaries;
    }

    /**
     * Determines the offset into the summaries WebArchiveBundle and generation
     * of the provided url (or hash_url) so that the info:url
     * (info:base64_hash_url) summary can be retrieved. This assumes of course
     * that the info:url  meta word has been stored.
     *
     * Only the first document returned by the word iterator is used.
     *
     * @param string $url_or_key either info:base64_hash_url or just a url to
     *      lookup
     * @param string $index_name index into which to do the lookup; if empty,
     *      $this->index_name is used
     * @param bool $is_key whether the string is info:base64_hash_url or just a
     *      url
     * @return mixed array (offset, generation) into the web archive bundle,
     *      or false if no document was found
     */
    function lookupSummaryOffsetGeneration($url_or_key, $index_name = "",
        $is_key = false)
    {
        if($index_name == "") {
            $index_name = $this->index_name;
        }
        $index_archive = IndexManager::getIndex($index_name);
        $hash_key = ($is_key) ? crawlHash($url_or_key) :
            crawlHash("info:$url_or_key");
        $word_iterator = new WordIterator($hash_key, $index_name);

        $next_docs = $word_iterator->nextDocsWithWord();
        if(!is_array($next_docs)) {
            return false;
        }
        foreach($next_docs as $doc_info) {
            $summary_offset = $doc_info[CrawlConstants::SUMMARY_OFFSET];
            $generation = $doc_info[CrawlConstants::GENERATION];
            $index_archive->setCurrentShard($generation, true);
            /* NOTE(review): the fetched page is not used here; the call is
               kept in case callers depend on a side effect of it -- confirm
               and remove if there is none.
             */
            @$index_archive->getPage($summary_offset);
            // the first matching document is the one we want
            return array($summary_offset, $generation);
        }
        return false;
    }


    /**
     *  This method is invoked by other ParallelModel (@see CrawlModel
     *  for examples) methods when they want to have their method performed
     *  on an array of other  Yioop instances. The results returned can then
     *  be aggregated.  The invocation sequence is
     *  crawlModelMethodA invokes execMachine with a list of
     *  urls of other Yioop instances. execMachine makes REST requests of
     *  those instances of the given command and optional arguments
     *  This request would be handled by a CrawlController which in turn
     *  calls crawlModelMethodA on the given Yioop instance, serializes the
     *  result and gives it back to execMachine and then back to the originally
     *  calling function.
     *
     *  @param string $command the ParallelModel method to invoke on the remote
     *      Yioop instances
     *  @param array $machine_urls machines to invoke this command on; the
     *      array key of each url is sent to that machine as its index (i)
     *  @param string $arg additional arguments to be passed to the remote
     *      machine
     *  @param int $num_machines the integer to be used in calculating
     *      partition; if 0, the number of entries of $machine_urls is used
     *  @return array a list of outputs from each machine that was called.
     */
    function execMachines($command, $machine_urls, $arg = NULL,
        $num_machines = 0)
    {
        if($num_machines == 0) {
            $num_machines = count($machine_urls);
        }
        $time = time();
        $session = md5($time . AUTH_KEY);
        $query = "c=crawl&a=$command&time=$time&session=$session" .
            "&num=$num_machines";
        if($arg != NULL) {
            $arg = webencode($arg);
            $query .= "&arg=$arg";
        }
        $sites = array();
        $post_data = array();
        // $sites and $post_data are built in parallel, both indexed 0, 1, ...
        foreach($machine_urls as $index => $machine_url) {
            $sites[] = array(CrawlConstants::URL => $machine_url);
            $post_data[] = $query."&i=$index";
        }
        $outputs = array();
        if(count($sites) > 0) {
            $outputs = FetchUrl::getPages($sites, false, 0, NULL, self::URL,
                self::PAGE, true, $post_data);
        }

        return $outputs;
    }

}
?>
ViewGit