Last commit for lib/fetch_url.php: 77c8db45b417901903ea930e9cb3b3ba7f4677f8

fetch_url.php modified to extract ETags from downloaded web pages and send ETags along with HTTP header

Akshat Kukreti [2013-05-23 03:10:09]
fetch_url.php modified to extract ETags from downloaded web pages and send ETags along with HTTP header

Signed-off-by: Chris Pollett <chris@pollett.org>
<?php
/**
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009 - 2012  Chris Pollett chris@pollett.org
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage library
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2012
 * @filesource
 */

if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

/**
 * Reads in constants used as enums used for storing web sites
 */
require_once BASE_DIR."/lib/crawl_constants.php";

/**
 *
 * Code used to manage HTTP requests from one or more URLS
 *
 * @author Chris Pollett
 *
 * @package seek_quarry
 * @subpackage library
 */
class FetchUrl implements CrawlConstants
{

    /**
     * Make multi_curl requests for an array of sites with urls
     *
     * @param array $sites  an array containing urls of pages to request
     * @param bool $timer  flag, true means print timing statistics to log
     * @param int $page_range_request maximum number of bytes to download/page
     * @param string $key  the component of $sites[$i] that has the value of
     *      a url to get defaults to URL
     * @param string $value component of $sites[$i] in which to store the
     *      page that was gotten
     *
     *  @return array an updated array with the contents of those pages
     */

    public static function getPages($sites, $timer = false,
        $page_range_request = PAGE_RANGE_REQUEST,
        $key=CrawlConstants::URL, $value = CrawlConstants::PAGE)
    {
        $agent_handler = curl_multi_init();

        $active = NULL;

        $start_time = microtime();

        //Set-up requests
        for($i = 0; $i < count($sites); $i++) {
            if(isset($sites[$i][$key])) {
                $sites[$i][0] = curl_init();
                $ip_holder[$i] = fopen(CRAWL_DIR."/temp/tmp$i.txt", 'w+');
                curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
                $url = str_replace("&amp;", "&", $sites[$i][$key]);
                curl_setopt($sites[$i][0], CURLOPT_URL, $url);
                curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
                curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
                curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, true);
                curl_setopt($sites[$i][0], CURLOPT_MAXREDIRS, 5);
                curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
                curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
                curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT,PAGE_TIMEOUT);
                curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT);
                curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
                curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
                   // ^ need to set for sites like att that use gzip
                if($page_range_request > 0) {
                    curl_setopt($sites[$i][0], CURLOPT_RANGE, "0-".
                        $page_range_request);
                }
                curl_multi_add_handle($agent_handler, $sites[$i][0]);
            }
        }
        if($timer) {
            crawlLog("  Init Get Pages ".(changeInMicrotime($start_time)));
        }
        $start_time = microtime();
        $start = time();

        //Wait for responses
        do {
            $mrc = @curl_multi_exec($agent_handler, $active);
        } while (time() - $start < PAGE_TIMEOUT &&
            $mrc == CURLM_CALL_MULTI_PERFORM );

        if(time() - $start > PAGE_TIMEOUT) {crawlLog("  TIMED OUT!!!");}

        while (time()-$start < PAGE_TIMEOUT && $active && $mrc == CURLM_OK) {
            if (curl_multi_select($agent_handler, 1) != -1) {
                do {
                     $mrc = @curl_multi_exec($agent_handler, $active);
                } while (time()-$start < PAGE_TIMEOUT &&
                    $mrc == CURLM_CALL_MULTI_PERFORM);
            }
        }

        if($timer) {
            crawlLog("  Page Request time ".(changeInMicrotime($start_time)));
        }
        $start_time = microtime();

        //Process returned pages
        for($i = 0; $i < count($sites); $i++) {
            if(isset($ip_holder[$i]) ) {
                $ip_addresses = self::getCurlIp($ip_holder[$i]);
                fclose($ip_holder[$i]);
            }
            if(isset($sites[$i][0]) && $sites[$i][0]) {
                // Get Data and Message Code
                $content = @curl_multi_getcontent($sites[$i][0]);
                /*
                    If the Transfer-encoding was chunked then the Range header
                    we sent was ignored. So we manually truncate the data
                    here
                 */
                $content = substr($content, 0, $page_range_request);
                if(isset($content)) {
                    $site = self::parseHeaderPage($content, $value);
                    $sites[$i] = array_merge($sites[$i], $site);
                }

                $sites[$i][self::HTTP_CODE] =
                    curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
                if(!$sites[$i][self::HTTP_CODE]) {
                    $sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]);
                }
                if($ip_addresses) {
                    $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
                } else {
                    $sites[$i][self::IP_ADDRESSES] = array("0.0.0.0");
                }

                //Get Time, Mime type and Character encoding
                $sites[$i][self::TIMESTAMP] = time();

                $type_parts =
                    explode(";", curl_getinfo($sites[$i][0],
                        CURLINFO_CONTENT_TYPE));

                $sites[$i][self::TYPE] = strtolower(trim($type_parts[0]));


                curl_multi_remove_handle($agent_handler, $sites[$i][0]);
                // curl_close($sites[$i][0]);
            } //end big if

        } //end for

        if($timer) {
            crawlLog("  Get Page Content time ".
                (changeInMicrotime($start_time)));
        }
        curl_multi_close($agent_handler);

        return $sites;
    }

    /**
     * Computes a hash of a string containing page data for use in
     * deduplication of pages with similar content
     *
     *  @param string &$page  web page data
     *  @return string 8 byte hash to identify page contents
     */
    public static function computePageHash(&$page)
    {
        /* to do dedup we strip script, noscript, and style tags
           as well as their content, then we strip tags, get rid
           of whitespace and hash
         */
        $strip_array =
            array('@<script[^>]*?>.*?</script>@si',
                '@<noscript[^>]*?>.*?</noscript>@si',
                '@<style[^>]*?>.*?</style>@si');
        $dedup_string = preg_replace(
            $strip_array, '', $page);
        $dedup_string_old = preg_replace(
            '/\W+/', '', $dedup_string);
        $dedup_string = strip_tags($dedup_string_old);
        if($dedup_string == "") {
            $dedup_string = $dedup_string_old;
        }
        $dedup_string = preg_replace(
            '/\W+/', '', $dedup_string);

        return crawlHash($dedup_string, true);
    }

    /**
     *  Splits an http response document into the http headers sent
     *  and the web page returned. Parses out useful information from
     *  the header and return an array of these two parts and the useful info.
     *
     *  @param string &$header_and_page
     *  @param string $value
     *  @return array info array consisting of a header, page for an http
     *      response, as well as parsed from the header the server, server
     *      version, operating system, encoding, and date information.
     */
    public static function parseHeaderPage(&$header_and_page,
        $value=CrawlConstants::PAGE)
    {
        $new_offset = 0;
        // header will include all redirect headers
        $site = array();
        $site[CrawlConstants::LOCATION] = array();
        do {
            $continue = false;
            $CRLFCRLF = strpos($header_and_page, "\x0D\x0A\x0D\x0A",
                $new_offset);
            $LFLF = strpos($header_and_page, "\x0A\x0A", $new_offset);
            //either two CRLF (what spec says) or two LF's to be safe
            $old_offset = $new_offset;
            $header_offset = ($CRLFCRLF > 0) ? $CRLFCRLF : $LFLF;
            $new_offset = ($CRLFCRLF > 0) ? $header_offset + 4
                : $header_offset + 2;
            $redirect_pos = stripos($header_and_page, 'Location:', $old_offset);
            $redirect_str = "Location:";
            if($redirect_pos === false) {
                $redirect_pos =
                    stripos($header_and_page, 'Refresh:', $old_offset);
                $redirect_str = "Refresh:";
            }
            if(isset($header_and_page[$redirect_pos - 1]) &&
                ord($header_and_page[$redirect_pos - 1]) > 32) {
                $redirect_pos = $new_offset; //ignore X-XRDS-Location header
            } else if($redirect_pos !== false && $redirect_pos < $new_offset){
                $redirect_pos += strlen($redirect_str);
                $pre_line = substr($header_and_page, $redirect_pos,
                    strpos($header_and_page, "\n", $redirect_pos) -
                    $redirect_pos);
                $loc = @trim($pre_line);
                if(strlen($loc) > 0) {
                    $site[CrawlConstants::LOCATION][] = @$loc;
                }
                $continue = true;
            }
        } while($continue);


        $site[CrawlConstants::HEADER] =
            substr($header_and_page, 0, $header_offset);
        $site[$value] = ltrim(substr($header_and_page, $header_offset));

        $lines = explode("\n", $site[CrawlConstants::HEADER]);
        $first_line = array_shift($lines);
        $response = preg_split("/(\s+)/", $first_line);
        $site[CrawlConstants::HTTP_CODE] = @trim($response[1]);
        foreach($lines as $line) {
            $line = trim($line);
            if(stristr($line, 'Server:')) {
                $server_parts = explode("Server:", $line);
                $server_name_parts = @explode("/", $server_parts[1]);
                $site[CrawlConstants::SERVER] = @trim($server_name_parts[0]);
                if(isset($server_name_parts[1])) {
                    $version_parts = explode("(", $server_name_parts[1]);
                    $site[CrawlConstants::SERVER_VERSION] =
                        @trim($version_parts[0]);
                    if(isset($version_parts[1])) {
                        $os_parts = explode(")", $version_parts[1]);
                        $site[CrawlConstants::OPERATING_SYSTEM] =
                            @trim($os_parts[0]);
                    }
                }
            }
            if(stristr($line, 'charset=')) {
                $line_parts = explode("charset=", $line);
                $site[CrawlConstants::ENCODING] =
                    strtoupper(@trim($line_parts[1]));
            }
            if(stristr($line, 'Last-Modified:')) {
                $line_parts = explode("Last-Modified:", $line);
                $site[CrawlConstants::MODIFIED] =
                    strtotime(@trim($line_parts[1]));
            }
        }
        /*
           If the doc is HTML and it uses a http-equiv to set the encoding
           then we override what the server says (if anything). As we
           are going to convert to UTF-8 we remove the charset info
           from the meta tag so cached pages will display correctly and
           redirects without char encoding won't be given a different hash.
         */
        $end_head = stripos($site[$value], "</head");
        if($end_head) {
            $reg = "charset(\s*)=(\s*)(\'|\")?((\w|\-)+)(\'|\")?";
            mb_regex_encoding("UTF-8");
            mb_ereg_search_init($site[$value]);
            mb_ereg_search($reg);
            $match = mb_ereg_search_getregs();
            if(isset($match[0])) {
                $len_c = mb_strlen($match[0]);
                if(($match[6] == "'" || $match[6] == '"') &&
                   $match[3] != $match[6]) {
                    $len_c--;
                }
                $start_charset = strpos($site[$value], $match[0]);
                if($start_charset + $len_c < $end_head) {
                    if(isset($match[4])) {
                        $site[CrawlConstants::ENCODING] = strtoupper(
                            $match[4]);
                        $site[$value] = substr_replace(
                            $site[$value], "", $start_charset,
                            $len_c);
                    }
                }
            }
        }
        if(!isset($site[CrawlConstants::ENCODING])) {
            //else  fallback to auto-detect
            $site[CrawlConstants::ENCODING] =
                mb_detect_encoding($site[$value], 'auto');
        }

        if(!isset($site[CrawlConstants::SERVER]) ) {
            $site[CrawlConstants::SERVER] = "unknown";
        }
        return $site;
    }

    /**
     * Computes the IP address from a file pointer assumed to be pointing
     * at STDERR output from a curl request
     *
     * @param resource $fp a file pointer to STDERR of a curl request
     * @return string IPv4 address as a string of dot separated quads.
     */
    static function getCurlIp($fp)
    {
        rewind($fp);
        $str = fread($fp, 8192);
        if (preg_match_all('/\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/',
            $str, $matches)) {
            return array_unique($matches[0]);
        } else {
            return false;
        }
    }


    /**
     *  Make a curl request for the provide url
     *
     *  @param string $site  url of page to request
     *  @param string $post_data  any data to be POST'd to the URL
     *
     *  @return string the contents of what the curl request fetched
     */
    public static function getPage($site, $post_data = NULL)
    {
        $agent = curl_init();
        crawlLog("Init curl request");
        curl_setopt($agent, CURLOPT_USERAGENT, USER_AGENT);
        curl_setopt($agent, CURLOPT_URL, $site);
        curl_setopt($agent, CURLOPT_AUTOREFERER, true);
        curl_setopt($agent, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($agent, CURLOPT_NOSIGNAL, true);
        curl_setopt($agent, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($agent, CURLOPT_FAILONERROR, true);
        curl_setopt($agent, CURLOPT_TIMEOUT, PAGE_TIMEOUT);
        curl_setopt($agent, CURLOPT_CONNECTTIMEOUT, PAGE_TIMEOUT);
        if($post_data != NULL) {
            curl_setopt($agent, CURLOPT_POST, true);
            curl_setopt($agent, CURLOPT_POSTFIELDS, $post_data);
        }
        crawlLog("Set curl options");
        $response = curl_exec($agent);
        crawlLog("Done curl exec");
        return $response;
    }
}
?>
ViewGit