Source for file fetch_url.php
Documentation is available at fetch_url.php
* Open Source Pure PHP Search Engine, Crawler, and Indexer
* Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* @author Chris Pollett chris@pollett.org
* @license http://www.gnu.org/licenses/ GPL3
* @link http://www.seekquarry.com/
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
* Reads in constants used as enums for storing web sites
require_once BASE_DIR. "/lib/crawl_constants.php";
* Code used to manage HTTP requests from one or more URLs
class FetchUrl implements CrawlConstants
* Make multi_curl requests for an array of sites with urls
* @param array $sites an array containing urls of pages to request
* @param bool $timer flag, true means print timing statistics to log
* @param int $page_range_request maximum number of bytes to download/page
* @param string $temp_dir folder to store temporary ip header info
* @param string $key the component of $sites[$i] that has the value of
* a url to get; defaults to URL
* @param string $value component of $sites[$i] in which to store the
* page that was fetched
* @param bool $minimal if true do a faster request of pages by not
* doing things like extracting the HTTP headers sent, etc.
* @param array $post_data data to be POST'd to each site
* @param bool $follow whether to follow redirects or not
* @return array an updated array with the contents of those pages
static function getPages($sites, $timer = false,
$page_range_request = PAGE_RANGE_REQUEST, $temp_dir = NULL,
$key= CrawlConstants::URL, $value = CrawlConstants::PAGE, $minimal= false,
$post_data = NULL, $follow = false)
if(!$minimal && $temp_dir == NULL) {
$num_sites = count($sites);
for($i = 0; $i < $num_sites; $i++ ) {
if(isset ($sites[$i][$key])) {
list ($sites[$i][$key], $url, $headers) =
self::prepareUrlHeaders($sites[$i][$key], $minimal);
$ip_holder[$i] = fopen("$temp_dir/tmp$i.txt", 'w+');
curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
$follow = true; /*wikipedia redirects their robot page. grr
want to force this for robots pages
curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow);
curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
// ^ need to set for sites like att that use gzip
if($page_range_request > 0) {
for($i = 0; $i < $num_sites; $i++ ) {
if(!$minimal && isset ($ip_holder[$i]) ) {
$header = fread($ip_holder[$i], 8192);
$ip_addresses = self::getCurlIp($header);
if(isset ($sites[$i][0]) && $sites[$i][0]) {
// Get Data and Message Code
If the Transfer-encoding was chunked then the Range header
we sent was ignored. So we manually truncate the data
if($page_range_request > 0) {
$content = substr($content, 0, $page_range_request);
if(isset ($content) && !$minimal) {
$site = self::parseHeaderPage($content, $value);
strpos($header, "\x0D\x0A\x0D\x0A") + 4);
$sites[$i][$value] = $content;
CURLINFO_NAMELOOKUP_TIME);
$sites[$i][self::HTTP_CODE] =
if(!$sites[$i][self::HTTP_CODE]) {
$sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]);
$sites[$i][self::IP_ADDRESSES] = $ip_addresses;
$sites[$i][self::IP_ADDRESSES] = array("0.0.0.0");
//Get Time, Mime type and Character encoding
$sites[$i][self::TIMESTAMP] = time();
// curl_close($sites[$i][0]);
/* in queue_server we added the ip (if available)
after the url followed by ###
$url_ip_parts = explode("###", $url);
if(count($url_ip_parts) > 1) {
if($len == 4 || $len == 16) {
$ip_address= "[$ip_address]";
if(count($url_ip_parts) > 1) {
$url = implode("###", $url_ip_parts);
if(isset ($url_parts['host'])) {
$url_with_ip_if_possible =
$url_with_ip_if_possible = $url;
$headers[] = "Host:". $url_parts['host'];
$url_with_ip_if_possible = $url;
$url_with_ip_if_possible = $url;
$url_with_ip_if_possible = $url;
$results = array($url, $url_with_ip_if_possible, $headers);
* Computes a hash of a string containing page data for use in
* deduplication of pages with similar content
* @param string &$page web page data
* @return string 8 byte hash to identify page contents
/* to do dedup we strip script, noscript, and style tags
as well as their content, then we strip tags, get rid
array('@<script[^>]*?>.*?</script>@si',
'@<noscript[^>]*?>.*?</noscript>@si',
'@<style[^>]*?>.*?</style>@si');
$strip_array, '', $page);
'/\W+/', '', $dedup_string);
if($dedup_string == "") {
$dedup_string = $dedup_string_old;
'/\W+/', '', $dedup_string);
* Splits an http response document into the http headers sent
* and the web page returned. Parses out useful information from
* the header and returns an array of these two parts and the useful info.
* @param string &$header_and_page reference to string of downloaded data
* @param string $value field to store the page portion of page
* @return array info array consisting of the header and page of an http
* response, as well as the server, server version, operating system,
* encoding, and date information parsed from the header.
$value= CrawlConstants::PAGE)
// header will include all redirect headers
$CRLFCRLF = strpos($header_and_page, "\x0D\x0A\x0D\x0A",
$LFLF = strpos($header_and_page, "\x0A\x0A", $new_offset);
//either two CRLF (what spec says) or two LF's to be safe
$old_offset = $new_offset;
$header_offset = ($CRLFCRLF > 0) ? $CRLFCRLF : $LFLF;
$header_offset = ($header_offset) ? $header_offset : 0;
$new_offset = ($CRLFCRLF > 0) ? $header_offset + 4
$redirect_pos = stripos($header_and_page, 'Location:', $old_offset);
$redirect_str = "Location:";
if($redirect_pos === false) {
stripos($header_and_page, 'Refresh:', $old_offset);
$redirect_str = "Refresh:";
if(isset ($header_and_page[$redirect_pos - 1]) &&
ord($header_and_page[$redirect_pos - 1]) > 32) {
$redirect_pos = $new_offset; //ignore X-XRDS-Location header
} else if($redirect_pos !== false && $redirect_pos < $new_offset){
$redirect_pos += strlen($redirect_str);
$pre_line = substr($header_and_page, $redirect_pos,
strpos($header_and_page, "\n", $redirect_pos) -
substr($header_and_page, 0, $header_offset);
$site[$value] = ltrim(substr($header_and_page, $header_offset));
} else { //header message no body; maybe 301?
foreach($lines as $line) {
$server_name_parts = @explode("/", $server_parts[1]);
if(isset ($server_name_parts[1])) {
$version_parts = explode("(", $server_name_parts[1]);
@trim($version_parts[0]);
if(isset ($version_parts[1])) {
$os_parts = explode(")", $version_parts[1]);
if(stristr($line, 'Content-type:')) {
if(stristr($line, 'Last-Modified:')) {
$line_parts = preg_split("/Last\-Modified\:/i", $line);
if(stristr($line, 'X-Robots-Tag:')) {
$line_parts = preg_split("/X\-Robots\-Tag\:/i", $line);
$robot_metas = explode(",", $line_parts[1]);
foreach($robot_metas as $robot_meta) {
If the doc is HTML and it uses a http-equiv to set the encoding
then we override what the server says (if anything). As we
are going to convert to UTF-8 we remove the charset info
from the meta tag so cached pages will display correctly and
redirects without char encoding won't be given a different hash.
$end_head = stripos($site[$value], "</head");
$reg = "charset(\s*)=(\s*)(\'|\")?((\w|\-)+)(\'|\")?";
if(($match[6] == "'" || $match[6] == '"') &&
$match[3] != $match[6]) {
$start_charset = strpos($site[$value], $match[0]);
if($start_charset + $len_c < $end_head) {
$site[$value], "", $start_charset,
//else fallback to auto-detect
* Computes the IP address from an http get-response header
* @param string contains complete transcript of HTTP get/response
* @return string IPv4 address as a string of dot separated quads.
if (preg_match_all('/Trying\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b/',
* Make a curl request for the provided url
* @param string $site url of page to request
* @param string $post_data any data to be POST'd to the URL
* @return string the contents of what the curl request fetched
static function getPage($site, $post_data = NULL)
static $agents = array();
$host = @parse_url($site,PHP_URL_HOST);
if(count($agents) > $MAX_SIZE) {
if(!isset ($agents[$host])) {
crawlLog(" Init curl request of a single page");
curl_setopt($agents[$host], CURLOPT_AUTOREFERER, true);
curl_setopt($agents[$host], CURLOPT_FOLLOWLOCATION, true);
curl_setopt($agents[$host], CURLOPT_SSL_VERIFYHOST, 0);
curl_setopt($agents[$host], CURLOPT_RETURNTRANSFER, true);
curl_setopt($agents[$host], CURLOPT_FAILONERROR, true);
curl_setopt($agents[$host], CURLOPT_HTTPHEADER, array('Expect:'));
curl_setopt($agents[$host], CURLOPT_POSTFIELDS, $post_data);
// since we are caching agents, need to do this so doesn't get stuck
// as post and so query string ignored for get's
crawlLog(" Set curl options for single page request");
crawlLog(" Request took longer than page timeout!!");
crawlLog(" Either could not reach URL or website took too");
|