Last commit for tests/CrawlQueueBundleTest.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2022  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2022
 * @filesource
 */
namespace seekquarry\yioop\tests;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\CrawlQueueBundle;
use seekquarry\yioop\library\UnitTest;

/**
 * UnitTest for the CrawlQueueBundle class.
 *
 * @author Chris Pollett
 */
class CrawlQueueBundleTest extends UnitTest
{
    /**
     * Handle to a DBMS manager object; held so that tearDown can call its
     * unlinkRecursive method
     * @var object
     */
    public $db;
    /**
     * Instantiates the manager class named after the configured DBMS and
     * keeps it in $this->db, so the test bundle's folder can later be
     * removed with unlinkRecursive
     */
    public function __construct()
    {
        $manager_class = C\NS_DATASOURCES . ucfirst(C\DBMS) . "Manager";
        $this->db = new $manager_class();
    }
    /**
     * Creates the CrawlQueueBundle under test in the QueueTest folder of
     * the work directory and stores it as test object FILE1. Both numeric
     * constructor arguments are 1000 (presumably the filter size and the
     * number of urls to keep in ram -- confirm against CrawlQueueBundle).
     */
    public function setUp()
    {
        $bundle_folder = C\WORK_DIRECTORY . "/QueueTest";
        $this->test_objects['FILE1'] = new CrawlQueueBundle($bundle_folder,
            1000, 1000);
    }
    /**
     * Removes the QueueTest folder, and whatever the bundle wrote into it,
     * from the work directory
     */
    public function tearDown()
    {
        $bundle_folder = C\WORK_DIRECTORY . "/QueueTest";
        $this->db->unlinkRecursive($bundle_folder);
    }
    /**
     * Puts robot rows for two hosts directly into the bundle's robot_table
     * (one host with allow/disallow paths, one with no paths at all) and
     * then checks that checkRobotOkay gives the expected answer for a
     * selection of urls on those hosts
     */
    public function addRobotPathsCheckRobotOkayTestCase()
    {
        $queue_bundle = $this->test_objects['FILE1'];
        $robot_rows = [
            [
                "HOSTNAME" => "http://www.test.com",
                "CAPTURE_TIME" => time(),
                "CRAWL_DELAY" => 0,
                "ROBOT_PATHS" => [
                    CrawlConstants::ALLOWED_SITES => ["/trapdoor"],
                    CrawlConstants::DISALLOWED_SITES =>
                        ["/trap", "/*?", '/cgi-bin'],
                ],
                "FLAGS" => 0,
            ],
            [
                "HOSTNAME" => "http://www.cs.sjsu.edu",
                "CAPTURE_TIME" => time(),
                "CRAWL_DELAY" => 0,
                "ROBOT_PATHS" => [
                    CrawlConstants::ALLOWED_SITES => [],
                    CrawlConstants::DISALLOWED_SITES => [],
                ],
                "FLAGS" => 0,
            ],
        ];
        $queue_bundle->robot_table->put($robot_rows);
        // url => [expected checkRobotOkay result, message for the assertion]
        $expectations = [
            "http://www.cs.sjsu.edu/" =>
                [true, "url with no stored rules"],
            "http://www.test.com/trapdoor" =>
                [true, "allowed url"],
            "http://www.test.com/trapdoor?b" =>
                [true, "allowed overrides all disallows"],
            "http://www.test.com/trap" =>
                [false, "forbidden url 1"],
            "http://www.test.com/abc?" =>
                [false, "forbidden url 2"],
            "http://www.test.com/a?b" =>
                [false, "forbidden url 3"],
            "http://www.test.com/cgi-bin/psearch?-list" =>
                [false, "forbidden url 4"],
        ];
        foreach ($expectations as $url => list($expected, $message)) {
            $is_okay = $queue_bundle->checkRobotOkay($url);
            $this->assertEqual($is_okay, $expected, $message);
        }
    }
}
ViewGit