Last commit for tests/url_parser_test.php: cd4fc2fc97b5a991654f7c4032bc8eacc0d90df0

First pass fetcher-based link pruning, a=chris

Chris Pollett [2013-03-01 02:24:08]
First pass fetcher-based link pruning, a=chris
<?php
/**
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009 - 2012  Chris Pollett chris@pollett.org
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage test
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2012
 * @filesource
 */

if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

/**
 *  Load the url parser library we'll be testing
 */
require_once BASE_DIR."/lib/url_parser.php";

/**
 *  Used to test that the UrlParser class. For now, want to see that the
 *  method canonicalLink is working correctly and that
 *  isPathMemberRegexPaths (used in robot_processor.php) works
 *
 *  @author Chris Pollett
 *  @package seek_quarry
 *  @subpackage test
 */
class UrlParserTest extends UnitTest
{
    /**
     * UrlParser uses static methods so doesn't do anything right now
     */
    public function setUp()
    {
    }

    /**
     * UrlParser uses static methods so doesn't do anything right now
     */
    public function tearDown()
    {
    }

    /**
     * Check if can go from a relative link, base link to a complete link
     * in various different ways
     */
    public function canonicalLinkTestCase()
    {
        $test_links = array(
            array(".", "http://www.example.com/",
                "http://www.example.com/", "root dir0"),
            array("/bob.html", "http://www.example.com/",
                "http://www.example.com/bob.html", "root dir1"),
            array("bob.html", "http://www.example.com/",
                "http://www.example.com/bob.html", "root dir2"),
            array("bob", "http://www.example.com/",
                "http://www.example.com/bob", "root dir3"),
            array("bob", "http://www.example.com",
                "http://www.example.com/bob", "root dir4"),
            array("http://print.bob.com/bob", "http://www.example.com",
                "http://print.bob.com/bob", "root dir5"),
            array("/.", "http://www.example.com/",
                "http://www.example.com/", "root dir6"),
            array("//slashdot.org", "http://www.slashdot.org",
                "http://slashdot.org/", "slashdot dir"),
            array("bob", "http://www.example.com/a",
                "http://www.example.com/a/bob", "sub dir1"),
            array("../bob", "http://www.example.com/a",
                "http://www.example.com/bob", "sub dir2"),
            array("../../bob", "http://www.example.com/a",
                NULL, "sub dir3"),
            array("./bob", "http://www.example.com/a",
                "http://www.example.com/a/bob", "sub dir4"),
            array("bob.html?a=1", "http://www.example.com/a",
                "http://www.example.com/a/bob.html?a=1", "query 1"),
            array("bob?a=1&b=2", "http://www.example.com/a",
                "http://www.example.com/a/bob?a=1&b=2", "query 2"),
            array("/?a=1&b=2", "http://www.example.com/a",
                "http://www.example.com/?a=1&b=2", "query 3"),
            array("?a=1&b=2", "http://www.example.com/a",
                "http://www.example.com/a/?a=1&b=2", "query 4"),
            array("b/b.html?a=1&b=2", "http://www.example.com/a/c",
                "http://www.example.com/a/c/b/b.html?a=1&b=2", "query 5"),
            array("b/b.html?a=1&b=2?c=4", "http://www.example.com/a/c",
                "http://www.example.com/a/c/b/b.html?a=1&b=2?c=4", "query 6"),
            array("b#1", "http://www.example.com/",
                "http://www.example.com/b#1", "fragment 1"),
            array("b?a=1#1", "http://www.example.com/",
                "http://www.example.com/b?a=1#1", "fragment 2"),
            array("b?a=1#1#2", "http://www.example.com/",
                "http://www.example.com/b?a=1#1#2", "fragment 3"),
            array("#a", "http://www.example.com/c:d",
                "http://www.example.com/c:d#a", "fragment 4"),
        );

        foreach($test_links as $test_link) {
            $result = UrlParser::canonicalLink($test_link[0],
                $test_link[1], false);
            $this->assertEqual($result, $test_link[2], $test_link[3]);
        }

    }


    /**
     * Check is a path matches with a list of paths presumably coming from
     * a robots.txt file
     */
    public function isPathMemberRegexPathsTestCase()
    {
        $path = array();
        $robot_paths = array();
        $results = array();
        $tests = array(
            array("/bobby", array("/bob"), true, "Substring Positive"),
            array("/bobby", array("/alice", "/f/g/h/d"), false,
                "Substring Negative 1"),
            array("/bobby/", array("/bobby/bay", "/f/g/h/d", "/yo"), false,
                "Substring Negative 2"),
            array("/bay/bobby/", array("/bobby/", "/f/g/h/d", "/yo"), false,
                "Substring Negative 3 (should match start)"),
            array("/a/bbbb/c/", array("/bobby/bay", "/a/*/c/", "/yo"), true,
                "Star Positive 1"),
            array("/a/bbbb/d/", array("/bobby/bay", "/a/*/c/", "/yo"), false,
                "Star Negative 1"),
            array("/test.html?a=b", array("/bobby/bay", "/*?", "/yo"), true,
                "Star Positive 2"),
            array("/test.html", array("/bobby/bay", "/*.html$", "/yo"), true,
                "Dollar Positive 1"),
            array("/test.htmlish", array("/bobby/bay", "/*.html$", "/yo"),false,
                "Dollar Negative 1"),
            array("/test.htmlish", array("/bobby/bay", "*", "/yo"),true,
                "Degenerate 1"),
            array("/test.html", array("/bobby/bay", "/**.html$", "/yo"), true,
                "Degenerate 2"),
            array("http://www.cs.sjsu.edu/faculty/pollett/",
                 array("http://www.cs.sjsu.edu/faculty/pollett/*/*/"), false,
                "URL Case 1"),
        );
        foreach ($tests as $test) {
            list($path, $robot_paths, $result, $description) = $test;
            $this->assertEqual(UrlParser::isPathMemberRegexPaths($path,
                $robot_paths), $result, $description);
        }
    }
}
?>
ViewGit