Last commit for src/library/FetchUrl.php: d3e3b85b48353dfd9156a4d82c9496c819837242

Chris Pollett [2024-06-15]
First pass replace Yandex translations with LM studio Llamma 3, first pass at doku wiki importer
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2018  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2018
 * @filesource
 */
namespace seekquarry\yioop\library;

use seekquarry\yioop\configs as C;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\UrlParser;

/** For Yioop global defines */
require_once __DIR__."/../configs/Config.php";
/**
 *
 * Code used to manage HTTP or Gopher requests for one or more URLs
 *
 * @author Chris Pollett
 */
class FetchUrl implements CrawlConstants
{
    /**
     * Make multi_curl requests for an array of sites with urls or onion urls
     *
     * @param array $sites  an array containing urls of pages to request
     * @param bool $timer  flag, true means print timing statistics to log
     * @param int $page_range_request maximum number of bytes to download/page
     *     0 means download all
     * @param string $temp_dir folder to store temporary ip header info
     * @param string $key  the component of $sites[$i] that has the value of
     *     a url to get; defaults to CrawlConstants::URL
     * @param string $value component of $sites[$i] in which to store the
     *     page that was gotten
     * @param bool $minimal if true do a faster request of pages by not
     *     doing things like extracting the HTTP headers sent, etc.
     * @param array $post_data data to be POST'd to each site
     * @param bool $follow whether to follow redirects or not
     * @param string $tor_proxy url of a proxy that knows how to download
     *     .onion urls
     * @param array $proxy_servers if not [], then an array of proxy
     *     servers to use rather than directly downloading web pages from
     *     the current machine
     *
     * @return array an updated array with the contents of those pages
     */
    public static function getPages($sites, $timer = false,
        $page_range_request = C\PAGE_RANGE_REQUEST, $temp_dir = null,
        $key=CrawlConstants::URL, $value = CrawlConstants::PAGE, $minimal=false,
        $post_data = null, $follow = false, $tor_proxy = "",
        $proxy_servers=[])
    {
        static $agent_handler = null;
        static $handler_time = 0;
        if (empty($agent_handler)) {
            /* try to keep handler around between calls to allow for connection
                reuse
             */
            $agent_handler = curl_multi_init();
            $handler_time = microtime(true);
        }
        $active = null;
        $start_time = microtime(true);
        if (!$minimal && $temp_dir == null) {
            $temp_dir = C\CRAWL_DIR."/temp";
            if (!file_exists($temp_dir)) {
                mkdir($temp_dir);
            }
        }
        //Set-up requests
        $num_sites = count($sites);
        for ($i = 0; $i < $num_sites; $i++) {
            $is_gopher = false;
            $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
            $host = @parse_url($sites[$i][$key], PHP_URL_HOST);
            $local_hosts = ['localhost', '::1', '0.0.0.0',
                '127.0.0.1'];
            if (!empty($_SERVER['SERVER_NAME'])) {
                $local_hosts[] = $_SERVER['SERVER_NAME'];
                if (substr($_SERVER['SERVER_NAME'], 0, 3) != "www") {
                    $local_hosts[] = "www." . $_SERVER['SERVER_NAME'];
                }
            }
            if (C\nsdefined("IS_OWN_WEB_SERVER") &&
                in_array($host, $local_hosts)) {
                $port = @parse_url($sites[$i][$key], PHP_URL_PORT);
                if (empty($port)) {
                    $port = $_SERVER['SERVER_PORT'];
                }
                if ($port == $_SERVER['SERVER_PORT']) {
                    $sites[$i][0] = "INTERNAL";
                    if (empty($GLOBALS['web_site'])) {
                        $sites[$i][$value] = "";
                        continue;
                    }
                    $web_site = $GLOBALS['web_site'];
                    $sites[$i][0] = "INTERNAL";
                    $sites[$i][$value] = $web_site->processInternalRequest(
                        $sites[$i][$key], !$minimal, $post_data);
                    continue;
                }
            }
            if (isset($sites[$i][$key])) {
                list($sites[$i][$key], $url, $headers) =
                    self::prepareUrlHeaders($sites[$i][$key], $minimal,
                    $proxy_servers);
                if ($headers == "gopher") {
                    $is_gopher = true;
                    $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
                    $headers = [];
                }
                $sites[$i][0] = curl_init();
                if (!$minimal) {
                    $ip_holder[$i] = fopen("$temp_dir/tmp$i.txt", 'w+');
                    curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
                    curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
                }
                curl_setopt($sites[$i][0], CURLOPT_USERAGENT, C\USER_AGENT);
                curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE,
                    CURL_IPRESOLVE_WHATEVER);
                curl_setopt($sites[$i][0], CURLOPT_URL, $url);
                if (strcmp(substr($url,-10), "robots.txt") == 0 ) {
                    $sites[$i]['ROBOT'] = true;
                    $follow = true; /* wikipedia redirects its robots.txt page,
                                       so force following redirects for
                                       robots.txt requests */
                }
                curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow);
                curl_setopt($sites[$i][0], CURLOPT_HTTP_VERSION,
                    CURL_HTTP_VERSION_2_0);
                curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
                curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYPEER, false);
                curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
                curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
                curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT,
                    C\PAGE_TIMEOUT);
                curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, C\PAGE_TIMEOUT);
                if (stripos($url,'.onion') !== false && $tor_proxy != "") {
                    curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy);
                    //CURLPROXY_SOCKS5_HOSTNAME = 7
                    curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7);
                    if ($timer) {
                        crawlLog("Using Tor proxy for $url..");
                    }
                } else if ($proxy_servers != [] && !$is_gopher) {
                    $select_proxy = rand(0, count($proxy_servers) - 1);
                    $proxy_server = $proxy_servers[$select_proxy];
                    $proxy_parts = explode(":", $proxy_server);
                    $proxy_ip = $proxy_parts[0];
                    if (!isset($proxy_parts[2]) ||
                        strtolower($proxy_parts[2]) == 'http') {
                        $proxy_type = CURLPROXY_HTTP;
                    } else if (strtolower($proxy_parts[2]) == 'socks5') {
                        $proxy_type = CURLPROXY_SOCKS5;
                    } else {
                        $proxy_type = $proxy_parts[2];
                    }
                    if (isset($proxy_parts[1])) {
                        $proxy_port = $proxy_parts[1];
                    } else {
                        $proxy_port = "80";
                    }
                    curl_setopt($sites[$i][0], CURLOPT_PROXY,
                        "$proxy_ip:$proxy_port");
                    curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE,
                        $proxy_type);
                    if ($timer) {
                        crawlLog("Selecting proxy $select_proxy for $url");
                    }
                }
                if (!$minimal) {
                    curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
                }
                //make lighttpd happier
                if (!$is_gopher) {
                    curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER,
                        $headers);
                }
                curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
                   // ^ need to set for sites like att that use gzip
                if ($page_range_request > 0 && empty(
                    $sites[$i][CrawlConstants::NO_RANGE])) {
                    curl_setopt($sites[$i][0], CURLOPT_RANGE, "0-".
                        $page_range_request);
                } else if (!empty( $sites[$i][CrawlConstants::NO_RANGE])) {
                    crawlLog("No range used for $url");
                }
                if ($post_data != null) {
                    curl_setopt($sites[$i][0], CURLOPT_POST, true);
                    curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS,
                        $post_data[$i]);
                }
                curl_multi_add_handle($agent_handler, $sites[$i][0]);
            }
        }
        if ($timer) {
            crawlLog("  Init Get Pages ".(changeInMicrotime($start_time)));
        }
        $start_time = microtime(true);
        $start = time();
        //Wait for responses
        $running = null;
        $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
        $mrc_check = CURLM_CALL_MULTI_PERFORM;
        set_error_handler(null);
        do {
            $mrc = @curl_multi_exec($agent_handler, $running);
            /* The 0.05 is to prevent this from being too busy a loop sucking
               up CPU cycles. We check every 0.05 seconds whether another
               page is ready or not */
            if ($mrc != CURLM_CALL_MULTI_PERFORM) {
                $mrc_check = CURLM_OK;
                $ready = curl_multi_select($agent_handler, 0.05);
            }
        } while (memory_get_usage() < $memory_limit && $mrc == $mrc_check &&
            time() - $start < C\PAGE_TIMEOUT &&  $running > 0);
        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
        if (time() - $start > C\PAGE_TIMEOUT && $timer) {
            crawlLog("  TIMED OUT!!!");
        }
        if ($timer) {
            crawlLog("  Page Request time ".(changeInMicrotime($start_time)));
        }
        $start_time = microtime(true);
        //Process returned pages
        for ($i = 0; $i < $num_sites; $i++) {
            if ($timer) {
                crawlTimeoutLog("FetchUrl initial processing of page %s of %s",
                    $i, $num_sites);
            }
            if (!$minimal && isset($ip_holder[$i]) ) {
                rewind($ip_holder[$i]);
                $header = fread($ip_holder[$i], 8192);
                $ip_addresses = self::getCurlIp($header);
                fclose($ip_holder[$i]);
            }
            $is_gopher = false;
            $is_internal = false;
            if (!empty($sites[$i][0])) {
                // Get Data and Message Code
                if ($sites[$i][0] == 'INTERNAL') {
                    $is_internal = true;
                    $content = $sites[$i][$value];
                } else {
                    $content = @curl_multi_getcontent($sites[$i][0]);
                }
                $is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL];
                /*
                    If the Transfer-encoding was chunked then the Range header
                    we sent was ignored. So we manually truncate the data
                    here
                 */
                if ($page_range_request > 0) {
                    $init_len = strlen($content);
                    $content = substr($content, 0, $page_range_request);
                    if (strlen($content) != $init_len) {
                        $sites[$i][CrawlConstants::CONTENT_SIZE] = $init_len;
                    }
                }
                if (isset($content) && !$minimal && !$is_gopher) {
                    $site = self::parseHeaderPage($content, $value);
                    $sites[$i] = array_merge($sites[$i], $site);
                    if (isset($header)) {
                        $header = substr($header, 0,
                            strpos($header, "\x0D\x0A\x0D\x0A") + 4);
                    } else {
                        $header = "";
                    }
                    $sites[$i][CrawlConstants::HEADER] =
                        $header . $sites[$i][CrawlConstants::HEADER];
                    unset($header);
                } else if (isset($content) && !$minimal && $is_gopher) {
                    $sites[$i][CrawlConstants::HEADER] =
                        $header;
                    $sites[$i][$value] = $content;
                    unset($header);
                } else {
                    $sites[$i][$value] = $content;
                }
                if (!$minimal) {
                    if ($is_internal) {
                        $sites[$i][self::SIZE] = strlen($content);
                        $sites[$i][self::DNS_TIME] = 0;
                        $sites[$i][self::TOTAL_TIME] = 0;
                        $sites[$i][self::HTTP_CODE] =
                            (empty($sites[$i][self::HTTP_CODE])) ? "404" :
                            $sites[$i][self::HTTP_CODE];
                    } else {
                        $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0],
                            CURLINFO_SIZE_DOWNLOAD);
                        $sites[$i][self::DNS_TIME] =
                            @curl_getinfo($sites[$i][0],
                            CURLINFO_NAMELOOKUP_TIME);
                        $sites[$i][self::TOTAL_TIME] =
                            @curl_getinfo($sites[$i][0],
                            CURLINFO_TOTAL_TIME);
                        $sites[$i][self::HTTP_CODE] =
                            curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
                        if (!$sites[$i][self::HTTP_CODE] && !$is_gopher) {
                            $sites[$i][self::HTTP_CODE] =
                                curl_error($sites[$i][0]);
                        } else if ($is_gopher) {
                            $sites[$i][self::HTTP_CODE] = 200;
                        }
                    }
                    if (!empty($ip_addresses)) {
                        $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
                    } else {
                        $sites[$i][self::IP_ADDRESSES] = ["0.0.0.0"];
                    }
                    //Get Time, Mime type and Character encoding
                    $sites[$i][self::TIMESTAMP] = time();
                    if ($is_gopher) {
                        $path = UrlParser::getPath($sites[$i][self::URL]);
                        $filename =
                            UrlParser::getDocumentFilename(
                                $sites[$i][self::URL]);
                        if (isset($path[1])) {
                            $gopher_type = $path[1];
                        } else {
                            $gopher_type = 1;
                        }
                        if ($gopher_type == 1) {
                            $sites[$i][self::TYPE] = "text/gopher";
                        } else if (in_array($gopher_type,
                            [0, 3, 6])) {
                            $sites[$i][self::TYPE] = "text/plain";
                            if ($gopher_type == 6) {
                                $sites[$i][$value] = convert_uudecode(
                                    $content);
                            }
                        } else if ($gopher_type == 'h') {
                            $sites[$i][self::TYPE] = "text/html";
                        } else if ($gopher_type == 'g') {
                            $sites[$i][self::TYPE] = "image/gif";
                        }
                        $path_info = pathinfo($filename);
                        if (!isset($sites[$i][self::TYPE]) &&
                            isset($path_info['extension'])) {
                            $sites[$i][self::TYPE] =
                                UrlParser::guessMimeTypeFromFileName($filename);
                        } else if (!isset($sites[$i][self::TYPE])) {
                            $sites[$i][self::TYPE] = "unknown";
                        }
                    } else if (!$is_internal) {
                        $type_parts =
                            explode(";", curl_getinfo($sites[$i][0],
                                CURLINFO_CONTENT_TYPE));
                        $sites[$i][self::TYPE] =
                            strtolower(trim($type_parts[0]));
                    } else {
                        $sites[$i][self::TYPE] = "unknown";
                    }
                }
                /*
                   Ideally, we would remove the handle here for all requests.
                   However, on some Linux systems this seems to cause curl
                   to sometimes crash at crawl time by trying to free memory
                   twice. Not removing it on other systems causes crashes
                   at query time.
                 */
                if ($sites[$i][0] != 'INTERNAL') {
                    if ($minimal || !stristr(PHP_OS, "LINUX")) {
                        curl_multi_remove_handle($agent_handler, $sites[$i][0]);
                    }
                    curl_close($sites[$i][0]);
                }
                if (!empty($sites[$i]['ROBOT'])) {
                    if (isset($sites[$i][self::TYPE]) &&
                        $sites[$i][self::TYPE] != "text/plain" &&
                        isset($sites[$i][CrawlConstants::LOCATION]) &&
                        count($site[CrawlConstants::LOCATION]) > 0) {
                        $sites[$i][self::TYPE] = "text/plain";
                        $sites[$i][self::HTTP_CODE] = "200";
                        $tmp = wordwrap($sites[$i][$value], 80);
                        $tmp_parts = explode("\n", $tmp);
                        $tmp = "# Suspect server misconfiguration\n";
                        $tmp .= "# Assume shouldn't crawl this site.\n";
                        $tmp .= "# Pretending got following robots.txt.\n";
                        $tmp .= "User-agent: *\n";
                        $tmp .= "Disallow: /\n";
                        $tmp .= "# Original error code: ".
                            $sites[$i][self::HTTP_CODE]."\n";
                        $tmp .= "# Original content:\n";
                        foreach ($tmp_parts as $part) {
                            $tmp .= "#".$part."\n";
                        }
                        $sites[$i][$value] = $tmp;
                        $sites[$i][self::HTTP_CODE] = "200";
                        unset($site[CrawlConstants::LOCATION]);
                    }
                }
            } //end big if
        } //end for
        if ($timer) {
            crawlLog("  Get Page Content time ".
                (changeInMicrotime($start_time)));
        }
        if (microtime(true) - $handler_time > C\PAGE_TIMEOUT) {
            if (!empty($agent_handler)) {
                curl_multi_close($agent_handler);
            }
            $agent_handler = null;
        }
        return $sites;
    }
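    /* Rough usage sketch for getPages (illustrative only; the urls below are
       placeholders):

        $sites = [
            [CrawlConstants::URL => "https://www.example.com/"],
            [CrawlConstants::URL => "https://www.example.org/robots.txt"],
        ];
        $sites = FetchUrl::getPages($sites, true);
        foreach ($sites as $site) {
            // downloaded body ends up in $site[CrawlConstants::PAGE]; status
            // and timing info in HTTP_CODE, SIZE, TOTAL_TIME, IP_ADDRESSES
        }
     */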
    /**
     * Curl requests are typically made using a cached IP address which, when
     * available, is stored after ### at the end of the url. To make this
     * work, this function replaces the host in the url with that IP address
     * and adds an http Host: header with the original host to the headers
     * used for the curl request.
     * @param string $url site to download, potentially with an ip address
     *     at the end after ###
     * @param bool $minimal don't try to do the replacement, but do add an
     *     Expect: header
     * @param array $proxy_servers if not empty, an array of proxy servers
     *     used to crawl through
     * @return array 3-tuple (orig url, url with replacement, http header array)
     */
    public static function prepareUrlHeaders($url, $minimal = false,
        $proxy_servers = [])
    {
        $url = str_replace("&amp;", "&", $url);
        $is_gopher = false;
        if (substr($url, 0, 6) == "gopher") {
            $is_gopher = true;
        }
        /* Check if an ETag was added by the queue server. If found, create an
           If-None-Match header with the ETag and add it to the headers and
           remove the ETag from the url.
         */
        $if_none_match = "If-None-Match";
        $etag = null;
        if (C\USE_ETAG_EXPIRES && stristr($url, "ETag:")) {
            $etag_parts = preg_split("/ETag\:/i", $url);
            $etag_data = explode(" ", $etag_parts[1]);
            $etag = $etag_data[1];
            $pos = strrpos($url, "ETag:");
            $url = substr_replace($url, "", $pos, strlen("ETag: ".$etag));
        }
        /* in queue_server we added the ip (if available)
          after the url followed by ###
         */
        $headers = [];
        if (!$minimal) {
            $url_ip_parts = explode("###", $url);
            if ($proxy_servers != [] || (isset($url_ip_parts[0]) &&
                (stripos($url_ip_parts[0],'.onion') !== false)) ) {
                $url_ip_parts = [$url_ip_parts[0]];
                $url = $url_ip_parts[0];
            }
            if (count($url_ip_parts) > 1) {
                $ip_address = ltrim(urldecode(array_pop($url_ip_parts)), "#");
                $len = strlen(inet_pton($ip_address));
                if ($len == 4 || $len == 16) {
                    if ($len == 16) {
                      $ip_address= "[$ip_address]";
                    }
                    if (count($url_ip_parts) > 1) {
                        $url = implode("###", $url_ip_parts);
                    } else {
                        $url = $url_ip_parts[0];
                    }
                    $url_parts = @parse_url($url);
                    if (isset($url_parts['host'])) {
                        $cnt = 1;
                        $url_with_ip_if_possible =
                            str_replace($url_parts['host'], $ip_address ,$url,
                                 $cnt);
                        if ($cnt != 1) {
                            $url_with_ip_if_possible = $url;
                        } else {
                            $headers[] = "Host:".$url_parts['host'];
                        }
                    }
                } else {
                    $url_with_ip_if_possible = $url;
                }
            } else {
                $url_with_ip_if_possible = $url;
            }
        } else {
            $url_with_ip_if_possible = $url;
        }
        $headers[] = 'Expect:';
        if (C\USE_ETAG_EXPIRES && $etag !== null) {
            $etag_header = $if_none_match.": ".$etag;
            $headers[] = $etag_header;
        }
        if ($is_gopher) {
            $headers = "gopher";
        }
        $results = [$url, $url_with_ip_if_possible, $headers];
        return $results;
    }
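    /* Sketch of the url forms prepareUrlHeaders handles (the IP address and
       urls below are made-up values for illustration):

        // queue server appended a resolved ip after ###
        list($orig, $fetch_url, $headers) = FetchUrl::prepareUrlHeaders(
            "https://www.example.com/page###93.184.216.34");
        // $orig is the url with the ### suffix stripped, $fetch_url has the
        // host replaced by the ip, and $headers contains
        // "Host:www.example.com" plus "Expect:"

        // gopher urls short-circuit: the third element is the string "gopher"
        list(, , $headers) = FetchUrl::prepareUrlHeaders("gopher://example.com/");
     */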
    /**
     * Computes a hash of a string containing page data for use in
     * deduplication of pages with similar content
     *
     * @param string& $page reference to web page data
     * @return string 8 byte hash to identify page contents
     */
    public static function computePageHash(&$page)
    {
        /* to do dedup we strip script, noscript, and style tags
           as well as their content, then we strip tags, get rid
           of non-word characters, and hash
         */
        $strip_array =
            ['@<script[^>]*?>.*?</script>@si',
                '@<noscript[^>]*?>.*?</noscript>@si',
                '@<style[^>]*?>.*?</style>@si'];
        $dedup_string = preg_replace(
            $strip_array, '', $page);
        $dedup_string_old = preg_replace(
            '/\W+/', '', $dedup_string);
        $dedup_string = strip_tags($dedup_string_old);
        if ($dedup_string == "") {
            $dedup_string = $dedup_string_old;
        }
        $dedup_string = preg_replace(
            '/\W+/', '', $dedup_string);
        return crawlHash($dedup_string, true);
    }
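    /* Rough illustration of computePageHash (markup is invented): with the
       code as written above, script/noscript/style blocks are dropped and
       then all non-word characters are removed before hashing, so pages that
       differ only in those respects collide:

        $page_a = "<p>Hello,   World!</p><script>alert(1);</script>";
        $page_b = "<p>Hello World</p>";
        // both reduce to "pHelloWorldp" (tag names survive, since non-word
        // characters are stripped before strip_tags runs) prior to crawlHash
        $same = FetchUrl::computePageHash($page_a) ==
            FetchUrl::computePageHash($page_b);
     */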
    /**
     * Splits an http response document into the http headers sent
     * and the web page returned. Parses out useful information from
     * the header and return an array of these two parts and the useful info.
     *
     * @param string $header_and_page string of downloaded data
     * @param string $value field in which to store the page portion of the
     *     response
     * @return array info array consisting of a header, page for an http
     *     response, as well as parsed from the header the server, server
     *     version, operating system, encoding, and date information.
     */
    public static function parseHeaderPage($header_and_page,
        $value=CrawlConstants::PAGE)
    {
        $cache_page_validators = [];
        $cache_page_validators['etag'] = -1;
        $cache_page_validators['expires'] = -1;
        $new_offset = 0;
        // header will include all redirect headers
        $site = [];
        $site[CrawlConstants::LOCATION] = [];
        do {
            $continue = false;
            $CRLFCRLF = strpos($header_and_page, "\x0D\x0A\x0D\x0A",
                $new_offset);
            $LFLF = strpos($header_and_page, "\x0A\x0A", $new_offset);
            //either two CRLF (what spec says) or two LF's to be safe
            $old_offset = $new_offset;
            $header_offset = ($CRLFCRLF > 0) ? $CRLFCRLF : $LFLF;
            $header_offset = ($header_offset) ? $header_offset : 0;
            $new_offset = ($CRLFCRLF > 0) ? $header_offset + 4
                : $header_offset + 2;
            $redirect_pos = stripos($header_and_page, 'Location:', $old_offset);
            $redirect_str = "Location:";
            if ($redirect_pos === false) {
                $redirect_pos =
                    stripos($header_and_page, 'Refresh:', $old_offset);
                $redirect_str = "Refresh:";
            }
            if (isset($header_and_page[$redirect_pos - 1]) &&
                ord($header_and_page[$redirect_pos - 1]) > 32) {
                $redirect_pos = $new_offset; //ignore X-XRDS-Location header
            } else if ($redirect_pos !== false && $redirect_pos < $new_offset){
                $redirect_pos += strlen($redirect_str);
                $pre_line = substr($header_and_page, $redirect_pos,
                    strpos($header_and_page, "\n", $redirect_pos) -
                    $redirect_pos);
                $loc = @trim($pre_line);
                if (strlen($loc) > 0) {
                    $site[CrawlConstants::LOCATION][] = @$loc;
                }
                $continue = true;
            }
        } while($continue);
        if ($header_offset > 0) {
            $site[CrawlConstants::HEADER] =
                substr($header_and_page, 0, $header_offset);
            $site[$value] = ltrim(substr($header_and_page, $header_offset));
        } else { // header with no message body; maybe a 301?
            $site[CrawlConstants::HEADER] = $header_and_page;
            $site[$value] = " ";
        }
        $lines = explode("\n", $site[CrawlConstants::HEADER]);
        $first_line = array_shift($lines);
        $response = preg_split("/(\s+)/", $first_line);
        $site[CrawlConstants::HTTP_CODE] = isset($response[1]) ?
             @trim($response[1]) : 0;
        $site[CrawlConstants::ROBOT_METAS] = [];
        foreach ($lines as $line) {
            $line = trim($line);
            if (stristr($line, 'Server:')) {
                $server_parts = preg_split("/Server\:/i", $line);
                $server_name_parts = @explode("/", $server_parts[1]);
                $site[CrawlConstants::SERVER] = @trim($server_name_parts[0]);
                if (isset($server_name_parts[1])) {
                    $version_parts = explode("(", $server_name_parts[1]);
                    $site[CrawlConstants::SERVER_VERSION] =
                        @trim($version_parts[0]);
                    if (isset($version_parts[1])) {
                        $os_parts = explode(")", $version_parts[1]);
                        $site[CrawlConstants::OPERATING_SYSTEM] =
                            @trim($os_parts[0]);
                    }
                }
            }
            if (stristr($line, 'Content-type:')) {
                list(,$mimetype,) = preg_split("/:|;/i", $line);
                $site[CrawlConstants::TYPE] = trim($mimetype);
            }
            if (stristr($line, 'charset=')) {
                $line_parts = preg_split("/charset\=/i", $line);
                $site[CrawlConstants::ENCODING] =
                    strtoupper(@trim($line_parts[1]));
            }
            if (stristr($line, 'Last-Modified:')) {
                $line_parts = preg_split("/Last\-Modified\:/i", $line);
                $site[CrawlConstants::MODIFIED] =
                    strtotime(@trim($line_parts[1]));
            }
            if (stristr($line, 'X-Robots-Tag:')) { // robot directives pdfs etc
                $line_parts = preg_split("/X\-Robots\-Tag\:/i", $line);
                $robot_metas = explode(",", $line_parts[1]);
                foreach ($robot_metas as $robot_meta) {
                    $site[CrawlConstants::ROBOT_METAS][] = strtoupper(
                        trim($robot_meta));
                }
            }
            if (stristr($line, 'Content-Range:')) {
                $line_parts = explode("/", $line);
                if (!empty($line_parts[1])) {
                    $content_size = intval(trim($line_parts[1]));
                    if ($content_size > 0) {
                        $site[CrawlConstants::CONTENT_SIZE] = $content_size;
                    }
                }
            }
            $canonical_regex = "/Link\:\s*\<\s*(http.*)\s*\>\s*\;\s*".
                "rel\s*\=\s*(\"|')?canonical(\"|')?/";
            // levenshtein gives notices on strings longer than 255
            if (preg_match($canonical_regex, $line, $matches) &&
                isset($site[CrawlConstants::URL]) && strlen($matches[1]) < 252
                && (strlen($site[CrawlConstants::URL]) >= 255 ||
                levenshtein($matches[1], $site[CrawlConstants::URL]) > 3)) {
                // for rel canonical headers
                $site[CrawlConstants::LOCATION][] = $matches[1];
                $site[CrawlConstants::ROBOT_METAS][] = 'NOFOLLOW';
            }
            if (C\USE_ETAG_EXPIRES && stristr($line, 'ETag:')) {
                $line_parts = preg_split("/ETag\:/i", $line);
                if (isset($line_parts[1])) {
                    $etag_data = explode(" ", $line_parts[1]);
                    if (isset($etag_data[1])) {
                        $etag = $etag_data[1];
                        $cache_page_validators['etag'] = $etag;
                    }
                }
            }
            if (C\USE_ETAG_EXPIRES && stristr($line, 'Expires:')) {
                $line_parts = preg_split("/Expires\:/i", $line);
                $all_dates = $line_parts[1];
                $date_parts = explode(",", $all_dates);
                if (count($date_parts) == 2) {
                    $cache_page_validators['expires'] = strtotime(
                        $date_parts[1]);
                } else if (count($date_parts) > 2) {
                    /*Encountered some pages with more than one Expires date
                      :O */
                    $timestamps = [];
                    for ($i = 1;$i < count($date_parts); $i += 2) {
                        $ds = strtotime($date_parts[$i]);
                        $timestamps[] = $ds;
                    }
                    $lowest = min($timestamps);
                    $cache_page_validators['expires'] = $lowest;
                }
            }
            if (C\USE_ETAG_EXPIRES && !($cache_page_validators['etag'] == -1 &&
                $cache_page_validators['expires'] == -1)) {
                $site[CrawlConstants::CACHE_PAGE_VALIDATORS] =
                    $cache_page_validators;
            }
        }
        /*
           If the doc is HTML and it uses a http-equiv to set the encoding
           then we override what the server says (if anything). As we
           are going to convert to UTF-8 we remove the charset info
           from the meta tag so cached pages will display correctly and
           redirects without char encoding won't be given a different hash.
         */
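        /* For example (illustrative, assuming guessEncodingHtml finds the
           http-equiv charset): for a page containing
               <meta http-equiv="Content-Type"
                   content="text/html; charset=iso-8859-1">
           the detected encoding becomes ISO-8859-1 and the charset portion
           of that meta tag is cut out of the stored page below.
         */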
        $encoding_info = guessEncodingHtml($site[$value], true);
        if (is_array($encoding_info)) {
            list($site[CrawlConstants::ENCODING], $start_charset, $len_c) =
            $encoding_info;
            $site[$value] = substr_replace($site[$value], "", $start_charset,
                $len_c);
        } else {
            $site[CrawlConstants::ENCODING] = $encoding_info;
        }
        if (!isset($site[CrawlConstants::SERVER]) ) {
            $site[CrawlConstants::SERVER] = "unknown";
        }
        return $site;
    }
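    /* Sketch of parseHeaderPage on a hand-written response (the headers below
       are fabricated):

        $raw = "HTTP/1.1 301 Moved Permanently\x0D\x0A" .
            "Location: https://www.example.com/\x0D\x0A\x0D\x0A" .
            "HTTP/1.1 200 OK\x0D\x0A" .
            "Server: Apache/2.4 (Unix)\x0D\x0A" .
            "Content-Type: text/html; charset=UTF-8\x0D\x0A\x0D\x0A" .
            "<html><body>hi</body></html>";
        $site = FetchUrl::parseHeaderPage($raw);
        // $site[CrawlConstants::HTTP_CODE] == "301" (code of the first
        // status line), $site[CrawlConstants::LOCATION] ==
        // ["https://www.example.com/"], $site[CrawlConstants::TYPE] ==
        // "text/html", and $site[CrawlConstants::PAGE] holds the html body
     */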
    /**
     * Computes the IP addresses used for a request from curl's verbose
     * transcript of the HTTP request/response
     *
     * @param string $header contains the complete transcript of the HTTP
     *     request/response as recorded by curl in verbose mode
     * @return mixed an array of IP address strings (IPv4 or IPv6) found in
     *     the transcript, or false if none could be parsed
     */
    public static function getCurlIp($header)
    {
        if (preg_match_all('/Trying\s+(.*)(\.\.\.)/',
            $header, $matches)) {
            $out_addresses = [];
            $addresses = array_unique($matches[1]);
            foreach ($addresses as $address) {
                $num = @inet_pton($address);
                if ($num !== false) {
                    $out_addresses[] = $address;
                }
            }
            if ($out_addresses != []) {
                return $out_addresses;
            }
            return false;
        } else {
            return false;
        }
    }
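    /* Sketch of getCurlIp on fabricated curl verbose output (the exact text
       curl emits varies by version; this matches the "Trying <ip>..." form
       the regex above expects):

        $verbose = "* About to connect() to www.example.com port 80\n" .
            "*   Trying 93.184.216.34... connected\n";
        $ips = FetchUrl::getCurlIp($verbose);
        // $ips == ["93.184.216.34"]; false is returned when nothing parses
     */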
    /**
     * Make a curl request for the provided url
     *
     * @param string $site url of page to request
     * @param array $post_data  any data to be POST'd to the URL
     * @param bool $check_for_errors whether or not to check the response
     *      for the words NOTICE, WARNING, FATAL, which might indicate an
     *      error on the server
     * @param string $user_password username:password to use for connection if
     *      needed (optional)
     * @return string the contents of what the curl request fetched
     */
    public static function getPage($site, $post_data = null,
        $check_for_errors = false, $user_password = null)
    {
        static $agents = [];
        $not_web_setting = (php_sapi_name() == 'cli' &&
            !C\nsdefined("IS_OWN_WEB_SERVER"));
        $MAX_SIZE = 50;
        $host = @parse_url($site, PHP_URL_HOST);
        $local_hosts = ['localhost', '::1', '0.0.0.0',
            '127.0.0.1'];
        if (!empty($_SERVER['SERVER_NAME'])) {
            $local_hosts[] = $_SERVER['SERVER_NAME'];
            if (substr($_SERVER['SERVER_NAME'], 0, 3) != "www") {
                $local_hosts[] = "www." . $_SERVER['SERVER_NAME'];
            }
        }
        if ($host !== false) {
            if (C\nsdefined("IS_OWN_WEB_SERVER") &&
                in_array($host, $local_hosts)) {
                $port = @parse_url($site, PHP_URL_PORT);
                if (empty($port)) {
                    $port = $_SERVER['SERVER_PORT'];
                }
                if ($port == $_SERVER['SERVER_PORT']) {
                    if (empty($GLOBALS['web_site'])) {
                        return "";
                    }
                    $web_site = $GLOBALS['web_site'];
                    $output = $web_site->processInternalRequest($site, false,
                        $post_data);
                    return $output;
                }
            }
            if (count($agents) > $MAX_SIZE) {
                $agent_host = array_shift($agents);
                if ($agent_host) {
                    curl_close($agent_host);
                }
            }
            if (empty($agents[$host])) {
                $agents[$host] = curl_init();
            }
        }
        if ($not_web_setting) {
            crawlLog("  Init curl request of a single page");
        }
        curl_setopt($agents[$host], CURLOPT_USERAGENT, C\USER_AGENT);
        curl_setopt($agents[$host], CURLOPT_URL, $site);
        curl_setopt($agents[$host], CURLOPT_AUTOREFERER, true);
        curl_setopt($agents[$host], CURLOPT_FOLLOWLOCATION, true);
        // these next two lines should probably be modified for better security
        curl_setopt($agents[$host], CURLOPT_HTTP_VERSION,
            CURL_HTTP_VERSION_2_0);
        curl_setopt($agents[$host], CURLOPT_SSL_VERIFYHOST, 0);
        curl_setopt($agents[$host], CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($agents[$host], CURLOPT_NOSIGNAL, true);
        curl_setopt($agents[$host], CURLOPT_RETURNTRANSFER, true);
        curl_setopt($agents[$host], CURLOPT_FAILONERROR, true);
        curl_setopt($agents[$host], CURLOPT_TIMEOUT, C\SINGLE_PAGE_TIMEOUT);
        curl_setopt($agents[$host], CURLOPT_CONNECTTIMEOUT, C\PAGE_TIMEOUT);
        //make lighttpd happier
        curl_setopt($agents[$host], CURLOPT_HTTPHEADER, ['Expect:']);
        if ($post_data != null) {
            curl_setopt($agents[$host], CURLOPT_POST, true);
            curl_setopt($agents[$host], CURLOPT_POSTFIELDS, $post_data);
        } else {
            // since we are caching agents, need to do this so the handle
            // doesn't get stuck as a POST and so query strings aren't
            // ignored for GETs
            curl_setopt($agents[$host], CURLOPT_HTTPGET, true);
        }
        if ($user_password != null) {
            curl_setopt($agents[$host], CURLOPT_FAILONERROR, false);
            curl_setopt($agents[$host], CURLOPT_USERPWD, $user_password);
            curl_setopt($agents[$host], CURLOPT_SSL_VERIFYHOST, 2);
            curl_setopt($agents[$host], CURLOPT_SSLVERSION,
                CURL_SSLVERSION_TLSv1_2);
        }
        if ($not_web_setting) {
            crawlLog("  Set curl options for single page request");
        }
        $time = time();
        $response = curl_exec($agents[$host]);
        if (time() - $time > C\PAGE_TIMEOUT && $not_web_setting) {
            crawlLog("  Request took longer than page timeout!!");
            crawlLog("  Either could not reach URL or website took too");
            crawlLog("  long to respond.");
        }
        curl_setopt($agents[$host], CURLOPT_POSTFIELDS, "");
        if ($not_web_setting) {
            crawlLog("  Done curl exec");
        }
        if ($not_web_setting && $check_for_errors) {
            self::checkResponseForErrors($response);
        }
        return $response;
    }
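    /* Minimal usage sketch for getPage (urls and POST fields are
       placeholders):

        // simple GET
        $body = FetchUrl::getPage("https://www.example.com/");

        // POST; the third argument asks for error-word checking of the
        // response (only done when run from the command line)
        $body = FetchUrl::getPage("https://www.example.com/api",
            ["f" => "rss", "q" => "test"], true);
     */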
    /**
     * Given the results of a getPage call, check whether or not the response
     * had the words NOTICE, WARNING, FATAL which might indicate an error on
     * the server. If it does, then the $response string is sent to the
     * crawlLog
     *
     * @param string $response getPage response in which to check for errors
     */
    public static function checkResponseForErrors($response)
    {
        if (preg_match("/NOTICE|WARNING|FATAL/i", $response)) {
            crawlLog("There appears to have been an error in the server ".
                "response. Response was:");
            crawlLog(wordwrap($response));
        }
    }
}