Last commit for tests/UrlParserTest.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2022  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2022
 * @filesource
 */
namespace seekquarry\yioop\tests;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library\BloomFilterFile;
use seekquarry\yioop\library\UrlParser;
use seekquarry\yioop\library\UnitTest;

/**
 * Used to test the UrlParser class. For now, want to see that the
 * method canonicalLink is working correctly and that
 * isPathMemberRegexPaths (used in robot_processor.php) works
 *
 * @author Chris Pollett
 */
class UrlParserTest extends UnitTest
{
    /**
     * UrlParser uses static methods, so there is nothing to set up
     */
    public function setUp()
    {
    }
    /**
     * UrlParser uses static methods, so there is nothing to tear down
     */
    public function tearDown()
    {
    }
    /**
     * Checks that UrlParser::canonicalLink can go from a (relative link,
     * base link) pair to a complete link in various different ways.
     *
     * Each row of the table is
     * [link, base url, expected canonical link, subtest description].
     * An expected value of null is used for the case where the link tries
     * to climb above the root of the site.
     */
    public function canonicalLinkTestCase()
    {
        $test_links = [
            [".", "http://www.example.com/",
                "http://www.example.com/", "root dir0"],
            ["/bob.html", "http://www.example.com/",
                "http://www.example.com/bob.html", "root dir1"],
            ["bob.html", "http://www.example.com/",
                "http://www.example.com/bob.html", "root dir2"],
            ["bob", "http://www.example.com/",
                "http://www.example.com/bob", "root dir3"],
            ["bob", "http://www.example.com",
                "http://www.example.com/bob", "root dir4"],
            ["http://print.bob.com/bob", "http://www.example.com",
                "http://print.bob.com/bob", "root dir5"],
            ["/.", "http://www.example.com/",
                "http://www.example.com/", "root dir6"],
            ["//slashdot.org", "http://www.slashdot.org",
                "http://slashdot.org/", "slashdot dir"],
            ["bob", "http://www.example.com/a",
                "http://www.example.com/a/bob", "sub dir1"],
            ["../bob", "http://www.example.com/a",
                "http://www.example.com/bob", "sub dir2"],
            ["../../bob", "http://www.example.com/a",
                null, "sub dir3"],
            ["./bob", "http://www.example.com/a",
                "http://www.example.com/a/bob", "sub dir4"],
            ["bob.html?a=1", "http://www.example.com/a",
                "http://www.example.com/a/bob.html?a=1", "query 1"],
            ["bob?a=1&b=2", "http://www.example.com/a",
                "http://www.example.com/a/bob?a=1&b=2", "query 2"],
            ["/?a=1&b=2", "http://www.example.com/a",
                "http://www.example.com/?a=1&b=2", "query 3"],
            ["?a=1&b=2", "http://www.example.com/a",
                "http://www.example.com/a/?a=1&b=2", "query 4"],
            ["b/b.html?a=1&b=2", "http://www.example.com/a/c",
                "http://www.example.com/a/c/b/b.html?a=1&b=2", "query 5"],
            ["b/b.html?a=1&b=2?c=4", "http://www.example.com/a/c",
                "http://www.example.com/a/c/b/b.html?a=1&b=2?c=4", "query 6"],
            ["b#1", "http://www.example.com/",
                "http://www.example.com/b#1", "fragment 1"],
            ["b?a=1#1", "http://www.example.com/",
                "http://www.example.com/b?a=1#1", "fragment 2"],
            ["b?a=1#1#2", "http://www.example.com/",
                "http://www.example.com/b?a=1#1#2", "fragment 3"],
            ["#a", "http://www.example.com/c:d",
                "http://www.example.com/c:d#a", "fragment 4"],
        ];
        foreach ($test_links as $test_link) {
            list($link, $base, $expected, $description) = $test_link;
            // third argument is passed as false exactly as before; its
            // meaning is defined by UrlParser::canonicalLink
            $result = UrlParser::canonicalLink($link, $base, false);
            $this->assertEqual($result, $expected, $description);
        }
    }
    /**
     * Checks if a path matches with a list of paths presumably coming from
     * a robots.txt file.
     *
     * Each row of the table is
     * [path, list of robot paths, expected result, subtest description].
     */
    public function isPathMemberRegexPathsTestCase()
    {
        $tests = [
            ["/bobby", ["/bob"], true, "Substring Positive"],
            ["/bobby", ["/alice", "/f/g/h/d"], false,
                "Substring Negative 1"],
            ["/bobby/", ["/bobby/bay", "/f/g/h/d", "/yo"], false,
                "Substring Negative 2"],
            ["/bay/bobby/", ["/bobby/", "/f/g/h/d", "/yo"], false,
                "Substring Negative 3 (should match start)"],
            ["http://test.com/bay/bobby/",
                ["/bobby/", "/f/g/h/d", "/yo"], false,
                "Substring Negative 4 (should match start)"],
            ["/a/bbbb/c/", ["/bobby/bay", "/a/*/c/", "/yo"], true,
                "Star Positive 1"],
            ["/a/bbbb/d/", ["/bobby/bay", "/a/*/c/", "/yo"], false,
                "Star Negative 1"],
            ["/test.html?a=b", ["/bobby/bay", "/*?", "/yo"], true,
                "Star Positive 2"],
            ["/test.html", ["/bobby/bay", "/*.html$", "/yo"], true,
                "Dollar Positive 1"],
            ["/test.htmlish", ["/bobby/bay", "/*.html$", "/yo"], false,
                "Dollar Negative 1"],
            ["/test.htmlish", ["/bobby/bay", "*", "/yo"], true,
                "Degenerate 1"],
            ["/test.html", ["/bobby/bay", "/**.html$", "/yo"], true,
                "Degenerate 2"],
            ["/videos/search?q=Angelina+Jolie",
                ["/videos/search?"], true, "End With Question Regex Case 1"],
        ];
        foreach ($tests as $test) {
            list($path, $robot_paths, $expected, $description) = $test;
            $result = UrlParser::isPathMemberRegexPaths($path, $robot_paths);
            $this->assertEqual($result, $expected, $description);
        }
    }
    /**
     * Tests simplifyUrl function used on SERP pages.
     *
     * Each row of the table is
     * [url, second argument given to simplifyUrl, expected simplified url,
     * subtest description].
     */
    public function simplifyUrlTestCase()
    {
        $test_urls = [
            ["http://www.example.com/", 100,
                "www.example.com", "HTTP Domain only"],
            ["https://www.example.com/", 100,
                "www.example.com", "HTTPS Domain only"],
            ["http://www.superreallylongexample.com/", 25,
                "www.superreallylonge...e.com", "Domain truncate"],
            ["http://www.example.com/word1/word2/word3/word4", 25,
                "www.example.com/word...word4", "Path truncate"],
        ];
        foreach ($test_urls as $test_url) {
            list($url, $max_length, $expected, $description) = $test_url;
            $result = UrlParser::simplifyUrl($url, $max_length);
            $this->assertEqual($result, $expected, $description);
        }
    }
    /**
     * urlMemberSiteArray is a function called by both allowedToCrawlSite
     * and disallowedToCrawlSite to test if a url belongs to a list of
     * regex's of urls or domains. This test function tests this
     * functionality.
     *
     * Each row of the table is [url, expected result, subtest description].
     * Descriptions are kept distinct so that a failing subtest can be
     * identified from its label alone.
     */
    public function urlMemberSiteArrayTestCase()
    {
        $sites = ["http://www.example.com/",
            "http://www.cs.sjsu.edu/faculty/pollett/*/*/",
            "http://www.bing.com/video/search?*&*&",
            "http://*.cool.*/a/*/", "domain:ucla.edu",
            "domain:foodnetwork.com",
            "domain:.ottawa.ca",
            "domain:.ottawa2.ca",
            "http://ottawa2.ca/"];
        $test_urls = [
            ["http://www.cs.sjsu.edu/faculty/pollett/", false,
                "regex url negative 1"],
            ["http://www.bing.com/video/search?", false,
                "regex url negative 2"],
            ["http://www.cool.edu/a", false, "regex url negative 3"],
            ["http://ucla.edu.com", false, "domain test negative 1"],
            ["http://www.cs.sjsu.edu/faculty/pollett/a/b/c", true,
                "regex url positive 1"],
            ["http://www.bing.com/video/search?a&b&c", true,
                "regex url positive 2"],
            ["http://www.cool.bob.edu/a/b/c", true, "regex url positive 3"],
            ["http://test.ucla.edu", true, "domain test positive"],
            ["https://test.ucla.edu", true, "domain https test positive"],
            ["gopher://test.ucla.edu", true, "domain gopher test positive"],
            ["http://www.foodnetworkstore.com/small-appliances/", false,
                "domain test negative 2"],
            ["http://a.ottawa.ca/", true,
                "domain starting dot test positive 1"],
            ["http://ottawa.ca/", false, "domain starting dot test negative"],
            ["http://a.ottawa2.ca/", true,
                "domain starting dot test positive 2"],
            ["http://ottawa2.ca/", true, "domain starting dot test positive 3"],
        ];
        foreach ($test_urls as $test_url) {
            list($url, $expected, $description) = $test_url;
            // "s" is passed through unchanged from the original test; its
            // meaning is defined by UrlParser::urlMemberSiteArray
            $result = UrlParser::urlMemberSiteArray($url, $sites, "s");
            $this->assertEqual($result, $expected, $description);
        }
    }
    /**
     * Checks if getScheme is working okay.
     *
     * Each row of the table is [url, expected scheme, subtest description].
     * The last row expects "http" for a url with no explicit scheme.
     */
    public function getSchemeTestCase()
    {
        $test_links = [
            ["http://www.example.com/", "http", "Simple HTTP 1"],
            ["https://www.example.com/", "https", "Simple HTTPS 1"],
            ["gopher://www.example.com/", "gopher", "Simple GOPHER 1"],
            ["./", "http", "Simple HTTP 2"],
        ];
        foreach ($test_links as $test_link) {
            list($url, $expected, $description) = $test_link;
            $result = UrlParser::getScheme($url);
            $this->assertEqual($result, $expected, $description);
        }
    }
    /**
     * Checks if getHost is working okay.
     *
     * Each row of the table is [url, expected host, subtest description].
     * As the expected values show, the host is compared together with its
     * scheme and any port.
     */
    public function getHostTestCase()
    {
        $test_links = [
            ["https://somewhere.com:80/la/de/da", "https://somewhere.com:80",
                "Host with port"],
            ["https://10.1.10.10/?dfas=aga/", "https://10.1.10.10",
                "IP based host with query"],
            ["https://www.yioop.com/###@?woohoo", "https://www.yioop.com",
                "Host with fragment"],
        ];
        foreach ($test_links as $test_link) {
            list($url, $expected, $description) = $test_link;
            $result = UrlParser::getHost($url);
            $this->assertEqual($result, $expected, $description);
        }
    }
    /**
     * Checks if getCompanyLevelDomain is working okay.
     *
     * Each row of the table is
     * [url, expected company level domain, subtest description].
     */
    public function getCompanyLevelDomainTestCase()
    {
        $test_links = [
            ["https://www.somewhere.com/", "somewhere.com",
                "US URL Test 1"],
            ["https://aaa.bbb.cc.edu/", "cc.edu",
                "US URL Test 2"],
            ["https://somewhere.else.jp/", "else.jp", "Japanese URL Test 1"],
            ["https://somewhere.co.jp/", "somewhere.co.jp",
                "Japanese URL Test 2"],
            ["https://somewhere.com.jp/", "somewhere.com.jp",
                "Japanese URL Test 3"],
            ["https://somewhere.co.uk/", "somewhere.co.uk",
                "UK URL Test 1"],
        ];
        foreach ($test_links as $test_link) {
            list($url, $expected, $description) = $test_link;
            $result = UrlParser::getCompanyLevelDomain($url);
            $this->assertEqual($result, $expected, $description);
        }
    }
    /**
     * Checks if cullByDomainFilter is working okay. Only works if
     * there is a BloomFilterFile called C\WORK_DIRECTORY .
     *     "/data/domain_filters/top10000000.ftr" that at least contains
     * yahoo.com. If that file does not exist, the method returns without
     * making any assertions.
     *
     * Each row of the table is [url, expected result, subtest description].
     */
    public function cullByDomainFilterTestCase()
    {
        $filter_name = C\WORK_DIRECTORY .
            "/data/domain_filters/top10000000.ftr";
        if (!file_exists($filter_name)) {
            // no filter file available, so nothing can be checked
            return;
        }
        $filters = [BloomFilterFile::load($filter_name)];
        $test_links = [
            ["https://www.yahoo.com/foo", false,
                "Top Ten Million Site Test 1"],
            ["https://aaa.bbb.ccc.yahoo.com/foo/goo", false,
                "Top Ten Million Site Test 2"],
            ["https://pollett.org/foo/goo", false,
                "Not Top Ten Million Site Test 1"],
            ["https://www.pollett.org/foo/goo", false,
                "Not Top Ten Million Site Test 2"],
            ["https://aaa.bbb.ccc.pollett.org/foo/goo", true,
                "Not Top Ten Million Site Test 3"],
        ];
        foreach ($test_links as $test_link) {
            list($url, $expected, $description) = $test_link;
            $result = UrlParser::cullByDomainFilter($url, $filters);
            $this->assertEqual($result, $expected, $description);
        }
    }
}
ViewGit