seek_quarry
[ class tree: seek_quarry ] [ index: seek_quarry ] [ all elements ]

Source for file fetch_url.php

Documentation is available at fetch_url.php

  1. <?php
  2. /**
  3.  *  SeekQuarry/Yioop --
  4.  *  Open Source Pure PHP Search Engine, Crawler, and Indexer
  5.  *
  6.  *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
  7.  *
  8.  *  LICENSE:
  9.  *
  10.  *  This program is free software: you can redistribute it and/or modify
  11.  *  it under the terms of the GNU General Public License as published by
  12.  *  the Free Software Foundation, either version 3 of the License, or
  13.  *  (at your option) any later version.
  14.  *
  15.  *  This program is distributed in the hope that it will be useful,
  16.  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17.  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18.  *  GNU General Public License for more details.
  19.  *
  20.  *  You should have received a copy of the GNU General Public License
  21.  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
  22.  *
  23.  *  END LICENSE
  24.  *
  25.  * @author Chris Pollett chris@pollett.org
  26.  * @package seek_quarry
  27.  * @subpackage library
  28.  * @license http://www.gnu.org/licenses/ GPL3
  29.  * @link http://www.seekquarry.com/
  30.  * @copyright 2009 - 2013
  31.  * @filesource
  32.  */
  33.  
  // Refuse to run when this file is requested directly rather than being
  // included by an entry script that has defined BASE_DIR.
  if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
  35.  
  36. /**
  37.  * Reads in constants used as enums used for storing web sites
  38.  */
  39. require_once BASE_DIR."/lib/crawl_constants.php";
  40.  
  41. /**
  42.  *
  43.  * Code used to manage HTTP requests from one or more URLS
  44.  *
  45.  * @author Chris Pollett
  46.  *
  47.  * @package seek_quarry
  48.  * @subpackage library
  49.  */
  50. class FetchUrl implements CrawlConstants
  51. {
  52.  
  53.     /**
  54.      * Make multi_curl requests for an array of sites with urls
  55.      *
  56.      * @param array $sites  an array containing urls of pages to request
  57.      * @param bool $timer  flag, true means print timing statistics to log
  58.      * @param int $page_range_request maximum number of bytes to download/page
  59.      *       0 means download all
  60.      * @param string $temp_dir folder to store temporary ip header info
  61.      * @param string $key  the component of $sites[$i] that has the value of
  62.      *       a url to get defaults to URL
  63.      * @param string $value component of $sites[$i] in which to store the
  64.      *       page that was gotten
  65.      * @param bool $minimal if true do a faster request of pages by not
  66.      *       doing things like extract HTTP headers sent, etc.
  67.      * @param array $post_data data to be POST'd to each site
  68.      * @param bool $follow whether to follow redirects or not
  69.      *
  70.      * @return array an updated array with the contents of those pages
  71.      */
  72.  
  static function getPages($sites, $timer = false,
      $page_range_request = PAGE_RANGE_REQUEST, $temp_dir = NULL,
      $key = CrawlConstants::URL, $value = CrawlConstants::PAGE,
      $minimal = false, $post_data = NULL, $follow = false)
  {
      /*
         Overview: one curl easy handle is created per entry of $sites that
         has a $key field; the handle is kept in $sites[$i][0]. All handles
         are driven together through a curl multi handle until they finish,
         PAGE_TIMEOUT seconds pass, or memory use gets too high. Afterwards
         each entry is filled in with the downloaded data (stored under
         $value) and, unless $minimal is set, with header and timing info.
       */
      $agent_handler = curl_multi_init();

      $start_time = microtime();

      if(!$minimal && $temp_dir == NULL) {
          $temp_dir = CRAWL_DIR."/temp";
          if(!file_exists($temp_dir)) {
              mkdir($temp_dir);
          }
      }

      //Set-up requests
      $ip_holder = array();
      $num_sites = count($sites);
      for($i = 0; $i < $num_sites; $i++) {
          if(isset($sites[$i][$key])) {
              list($sites[$i][$key], $url, $headers) =
                  self::prepareUrlHeaders($sites[$i][$key], $minimal);
              $sites[$i][0] = curl_init();
              if(!$minimal) {
                  /* curl's verbose log is written to a temp file so that
                     the ip addresses it tried can be read back later
                   */
                  $verbose_log = fopen("$temp_dir/tmp$i.txt", 'w+');
                  if(is_resource($verbose_log)) {
                      $ip_holder[$i] = $verbose_log;
                      curl_setopt($sites[$i][0], CURLOPT_STDERR,
                          $ip_holder[$i]);
                      curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
                  }
              }
              curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
              curl_setopt($sites[$i][0], CURLOPT_URL, $url);
              /* wikipedia redirects their robot page. grr
                 want to force this for robots pages. The decision is made
                 per site so one robots.txt url does not switch redirect
                 following on for the rest of the batch.
               */
              $follow_site = $follow ||
                  (strcmp(substr($url, -10), "robots.txt") == 0);
              curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION,
                  $follow_site);
              curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
              curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
              curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
              curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT,
                  PAGE_TIMEOUT);
              curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT);
              if(!$minimal) {
                  curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
              }
              //make lighttpd happier
              curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER,
                  $headers);
              curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
                 // ^ need to set for sites like att that use gzip
              if($page_range_request > 0) {
                  curl_setopt($sites[$i][0], CURLOPT_RANGE, "0-".
                      $page_range_request);
              }
              if($post_data != NULL) {
                  curl_setopt($sites[$i][0], CURLOPT_POST, true);
                  curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS,
                      $post_data[$i]);
              }
              curl_multi_add_handle($agent_handler, $sites[$i][0]);
          }
      }
      if($timer) {
          crawlLog("  Init Get Pages ".(changeInMicrotime($start_time)));
      }
      $start_time = microtime();
      $start = time();

      //Wait for responses
      $running = NULL;
      $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
      do {
          curl_multi_exec($agent_handler, $running);
          curl_multi_select($agent_handler, 0.005);
      } while (memory_get_usage() < $memory_limit &&
          time() - $start < PAGE_TIMEOUT && $running > 0);

      if(time() - $start >= PAGE_TIMEOUT) {crawlLog("  TIMED OUT!!!");}

      if($timer) {
          crawlLog("  Page Request time ".(changeInMicrotime($start_time)));
      }
      $start_time = microtime();

      //Process returned pages
      for($i = 0; $i < $num_sites; $i++) {
          // reset per site so nothing leaks over from the previous one
          $header = NULL;
          $ip_addresses = false;
          if(!$minimal && isset($ip_holder[$i])) {
              rewind($ip_holder[$i]);
              $header = fread($ip_holder[$i], 8192);
              $ip_addresses = self::getCurlIp($header);
              fclose($ip_holder[$i]);
          }
          if(isset($sites[$i][0]) && $sites[$i][0]) {
              // Get Data and Message Code
              $content = @curl_multi_getcontent($sites[$i][0]);
              /*
                  If the Transfer-encoding was chunked then the Range header
                  we sent was ignored. So we manually truncate the data
                  here
               */
              if($page_range_request > 0) {
                  $content = substr($content, 0, $page_range_request);
              }
              if(isset($content) && !$minimal) {
                  $site = self::parseHeaderPage($content, $value);
                  $sites[$i] = array_merge($sites[$i], $site);
                  if(isset($header)) {
                      /* keep the verbose log up to and including its
                         first blank line and put it in front of the
                         header of the response
                       */
                      $header = substr($header, 0,
                          strpos($header, "\x0D\x0A\x0D\x0A") + 4);
                  } else {
                      $header = "";
                  }
                  $sites[$i][CrawlConstants::HEADER] =
                      $header . $sites[$i][CrawlConstants::HEADER];
              } else {
                  $sites[$i][$value] = $content;
              }
              if(!$minimal) {
                  $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0],
                      CURLINFO_SIZE_DOWNLOAD);
                  $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0],
                      CURLINFO_NAMELOOKUP_TIME);
                  $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0],
                      CURLINFO_TOTAL_TIME);
                  $sites[$i][self::HTTP_CODE] =
                      curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
                  if(!$sites[$i][self::HTTP_CODE]) {
                      // no status code at all: record curl's error text
                      $sites[$i][self::HTTP_CODE] =
                          curl_error($sites[$i][0]);
                  }
                  if($ip_addresses) {
                      $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
                  } else {
                      $sites[$i][self::IP_ADDRESSES] = array("0.0.0.0");
                  }

                  //Get Time, Mime type and Character encoding
                  $sites[$i][self::TIMESTAMP] = time();

                  $type_parts =
                      explode(";", curl_getinfo($sites[$i][0],
                          CURLINFO_CONTENT_TYPE));

                  $sites[$i][self::TYPE] =
                      strtolower(trim($type_parts[0]));
              }

              curl_multi_remove_handle($agent_handler, $sites[$i][0]);
          } //end big if

      } //end for
      if($timer) {
          crawlLog("  Get Page Content time ".
              (changeInMicrotime($start_time)));
      }
      curl_multi_close($agent_handler);
      return $sites;
  }
  232.  
  233.     /**
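  234.      * Cleans up a url and computes the HTTP headers to send when requesting it; returns array(url, url using the ip if possible, headers)
  235.      * @param string $url url to fetch, optionally followed by ### and an ip address to use for its host
  236.      * @param bool $minimal if true, no ### ip suffix is looked for and no Host header is added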
  237.      */
  static function prepareUrlHeaders($url, $minimal = false)
  {
      /*
         Returns array($url, $url_with_ip_if_possible, $headers) where $url
         has &amp; decoded and any valid ###ip suffix removed,
         $url_with_ip_if_possible is the same url with its host replaced by
         that ip when this can be done unambiguously, and $headers are the
         HTTP header lines to send with the request.
       */
      $url = str_replace("&amp;", "&", $url);
      $headers = array();
      // default used by every case in which the ip cannot be substituted
      $url_with_ip_if_possible = $url;
      /* in queue_server we added the ip (if available)
        after the url followed by ###
       */
      if(!$minimal) {
          $url_ip_parts = explode("###", $url);
          if(count($url_ip_parts) > 1) {
              $ip_address = urldecode(array_pop($url_ip_parts));
              // inet_pton gives 4 bytes for IPv4, 16 for IPv6, false if bad
              $packed_ip = @inet_pton($ip_address);
              $len = ($packed_ip === false) ? 0 : strlen($packed_ip);
              if($len == 4 || $len == 16) {
                  if($len == 16) {
                      $ip_address = "[$ip_address]";
                  }
                  // everything before the last ### is the real url
                  $url = implode("###", $url_ip_parts);
                  $url_with_ip_if_possible = $url;
                  $url_parts = @parse_url($url);
                  if(isset($url_parts['host'])) {
                      /* the fourth argument of str_replace receives the
                         number of replacements made; the ip is only used
                         when the host occurred exactly once in the url
                       */
                      $cnt = 0;
                      $replaced = str_replace($url_parts['host'],
                          $ip_address, $url, $cnt);
                      if($cnt == 1) {
                          $url_with_ip_if_possible = $replaced;
                          $headers[] = "Host:".$url_parts['host'];
                      }
                  }
              }
          }
      }
      // an empty Expect header stops curl from sending Expect: 100-continue
      $headers[] = 'Expect:';
      $results = array($url, $url_with_ip_if_possible, $headers);
      return $results;
  }
  284.  
  285.     /**
  286.      * Computes a hash of a string containing page data for use in
  287.      * deduplication of pages with similar content
  288.      *
  289.      *  @param string &$page  web page data
  290.      *  @return string 8 byte hash to identify page contents
  291.      */
  static function computePageHash(&$page)
  {
      /* to do dedup we strip script, noscript, and style tags
         as well as their content, then we strip tags, get rid
         of whitespace and hash
       */
      $strip_array =
          array('@<script[^>]*?>.*?</script>@si',
              '@<noscript[^>]*?>.*?</noscript>@si',
              '@<style[^>]*?>.*?</style>@si');
      $dedup_string = preg_replace(
          $strip_array, '', $page);
      if($dedup_string === NULL) {
          /* preg_replace returns NULL on a PCRE error (for instance when
             the backtrack limit is hit); fall back to the raw page so
             such pages do not all end up with the hash of an empty string
           */
          $dedup_string = $page;
      }
      $dedup_string_old = preg_replace(
          '/\W+/', '', $dedup_string);
      $dedup_string = strip_tags($dedup_string_old);
      if($dedup_string == "") {
          $dedup_string = $dedup_string_old;
      }
      $dedup_string = preg_replace(
          '/\W+/', '', $dedup_string);

      return crawlHash($dedup_string, true);
  }
  315.  
  316.     /**
  317.      *  Splits an http response document into the http headers sent
  318.      *  and the web page returned. Parses out useful information from
  319.      *  the header and return an array of these two parts and the useful info.
  320.      *
  321.      *  @param string &$header_and_page reference to string of downloaded data
  322.      *  @param string $value field to store the page portion of page
  323.      *  @return array info array consisting of a header, page for an http
  324.      *       response, as well as parsed from the header the server, server
  325.      *       version, operating system, encoding, and date information.
  326.      */
  static function parseHeaderPage(&$header_and_page,
      $value = CrawlConstants::PAGE)
  {
      $new_offset = 0;
      // header will include all redirect headers
      $site = array();
      $site[CrawlConstants::LOCATION] = array();
      /* Each pass looks at one header block (text up to the next blank
         line). If that block has a Location: or Refresh: header its
         target is recorded and the scan moves on to the next block, as
         the data may hold the headers of a chain of redirects.
       */
      do {
          $continue = false;
          $CRLFCRLF = strpos($header_and_page, "\x0D\x0A\x0D\x0A",
              $new_offset);
          $LFLF = strpos($header_and_page, "\x0A\x0A", $new_offset);
          //either two CRLF (what spec says) or two LF's to be safe
          $old_offset = $new_offset;
          $header_offset = ($CRLFCRLF > 0) ? $CRLFCRLF : $LFLF;
          $header_offset = ($header_offset) ? $header_offset : 0;
          $new_offset = ($CRLFCRLF > 0) ? $header_offset + 4
              : $header_offset + 2;
          $redirect_pos = stripos($header_and_page, 'Location:',
              $old_offset);
          $redirect_str = "Location:";
          if($redirect_pos === false) {
              $redirect_pos =
                  stripos($header_and_page, 'Refresh:', $old_offset);
              $redirect_str = "Refresh:";
          }
          /* The $redirect_pos > 0 test keeps the look-behind from turning
             into a negative string offset, which would read from the end
             of the string.
           */
          if($redirect_pos !== false && $redirect_pos > 0 &&
              ord($header_and_page[$redirect_pos - 1]) > 32) {
              $redirect_pos = $new_offset; //ignore X-XRDS-Location header
          } else if($redirect_pos !== false && $redirect_pos < $new_offset){
              $redirect_pos += strlen($redirect_str);
              $pre_line = substr($header_and_page, $redirect_pos,
                  strpos($header_and_page, "\n", $redirect_pos) -
                  $redirect_pos);
              $loc = @trim($pre_line);
              if(strlen($loc) > 0) {
                  $site[CrawlConstants::LOCATION][] = @$loc;
              }
              $continue = true;
          }
      } while($continue);
      if($header_offset > 0) {
          $site[CrawlConstants::HEADER] =
              substr($header_and_page, 0, $header_offset);
          $site[$value] = ltrim(substr($header_and_page, $header_offset));
      } else { //header message no body; maybe 301?
          $site[CrawlConstants::HEADER] = $header_and_page;
          $site[$value] = " ";
      }
      $lines = explode("\n", $site[CrawlConstants::HEADER]);
      $first_line = array_shift($lines);
      // status line: protocol, then status code, then reason phrase
      $response = preg_split("/(\s+)/", $first_line);

      $site[CrawlConstants::HTTP_CODE] = @trim($response[1]);
      $site[CrawlConstants::ROBOT_METAS] = array();
      foreach($lines as $line) {
          $line = trim($line);
          if(stristr($line, 'Server:')) {
              // expected shape: Server: name/version (operating system)
              $server_parts = preg_split("/Server\:/i", $line);
              $server_name_parts = @explode("/", $server_parts[1]);
              $site[CrawlConstants::SERVER] = @trim($server_name_parts[0]);
              if(isset($server_name_parts[1])) {
                  $version_parts = explode("(", $server_name_parts[1]);
                  $site[CrawlConstants::SERVER_VERSION] =
                      @trim($version_parts[0]);
                  if(isset($version_parts[1])) {
                      $os_parts = explode(")", $version_parts[1]);
                      $site[CrawlConstants::OPERATING_SYSTEM] =
                          @trim($os_parts[0]);
                  }
              }
          }
          if(stristr($line, 'Content-type:')) {
              list(, $mimetype, ) = preg_split("/:|;/i", $line);
              $site[CrawlConstants::TYPE] = trim($mimetype);
          }
          if(stristr($line, 'charset=')) {
              $line_parts = preg_split("/charset\=/i", $line);
              $site[CrawlConstants::ENCODING] =
                  strtoupper(@trim($line_parts[1]));
          }
          if(stristr($line, 'Last-Modified:')) {
              $line_parts = preg_split("/Last\-Modified\:/i", $line);
              $site[CrawlConstants::MODIFIED] =
                  strtotime(@trim($line_parts[1]));
          }
          if(stristr($line, 'X-Robots-Tag:')) {
              $line_parts = preg_split("/X\-Robots\-Tag\:/i", $line);
              $robot_metas = explode(",", $line_parts[1]);
              foreach($robot_metas as $robot_meta) {
                  $site[CrawlConstants::ROBOT_METAS][] = strtoupper(
                      trim($robot_meta));
              }
          }
      }
      /*
         If the doc is HTML and it uses a http-equiv to set the encoding
         then we override what the server says (if anything). As we
         are going to convert to UTF-8 we remove the charset info
         from the meta tag so cached pages will display correctly and
         redirects without char encoding won't be given a different hash.
       */
      $end_head = stripos($site[$value], "</head");
      if($end_head) {
          $reg = "charset(\s*)=(\s*)(\'|\")?((\w|\-)+)(\'|\")?";
          mb_regex_encoding("UTF-8");
          mb_ereg_search_init($site[$value]);
          mb_ereg_search($reg);
          $match = mb_ereg_search_getregs();
          if(isset($match[0])) {
              $len_c = mb_strlen($match[0]);
              /* a closing quote with no matching opening quote belongs to
                 the enclosing attribute, so it is left in the page
               */
              if(($match[6] == "'" || $match[6] == '"') &&
                 $match[3] != $match[6]) {
                  $len_c--;
              }
              $start_charset = strpos($site[$value], $match[0]);
              // only honour a charset declared before the end of the head
              if($start_charset + $len_c < $end_head) {
                  if(isset($match[4])) {
                      $site[CrawlConstants::ENCODING] = strtoupper(
                          $match[4]);
                      $site[$value] = substr_replace(
                          $site[$value], "", $start_charset,
                          $len_c);
                  }
              }
          }
      }
      if(!isset($site[CrawlConstants::ENCODING])) {
          //else  fallback to auto-detect
          $site[CrawlConstants::ENCODING] =
              mb_detect_encoding($site[$value], 'auto');
      }

      if(!isset($site[CrawlConstants::SERVER])) {
          $site[CrawlConstants::SERVER] = "unknown";
      }
      return $site;
  }
  464.  
  465.     /**
  466.      * Computes the IP addresses curl tried from its verbose request log
  467.      *
  468.      * @param string $header contains complete transcript of HTTP get/response
  469.      * @return mixed array of distinct IPv4 addresses, each a string of dot separated quads, or false if none was found
  470.      */
  static function getCurlIp($header)
  {
      /* Each match is an address in a "Trying a.b.c.d" line of the
         transcript. Only IPv4 dotted quads are recognised. Returns the
         distinct addresses in order of first appearance, or false if
         there are none.
       */
      if(preg_match_all('/Trying\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b/',
          $header, $matches)) {
          // array_unique keeps the original keys, so reindex from 0
          return array_values(array_unique($matches[1]));
      } else {
          return false;
      }
  }
  480.  
  481.  
  482.     /**
  483.      *  Make a curl request for the provide url
  484.      *
  485.      *  @param string $site  url of page to request
  486.      *  @param string $post_data  any data to be POST'd to the URL
  487.      *
  488.      *  @return string the contents of what the curl request fetched
  489.      */
  static function getPage($site, $post_data = NULL)
  {
      /* curl handles are cached per host in a static array so repeated
         requests to the same host reuse a handle; once the cache holds
         more than $MAX_SIZE handles the oldest entry is dropped.
       */
      static $agents = array();
      $MAX_SIZE = 50;
      $host = @parse_url($site, PHP_URL_HOST);
      if(!is_string($host)) {
          /* parse_url gives false for a malformed url and NULL for a url
             without a host; use one shared key so a handle always exists
             for the calls below
           */
          $host = "";
      }
      if(count($agents) > $MAX_SIZE) {
          array_shift($agents);
      }
      if(!isset($agents[$host])) {
          $agents[$host] = curl_init();
      }
      crawlLog("  Init curl request of a single page");
      curl_setopt($agents[$host], CURLOPT_USERAGENT, USER_AGENT);
      curl_setopt($agents[$host], CURLOPT_URL, $site);

      curl_setopt($agents[$host], CURLOPT_AUTOREFERER, true);
      curl_setopt($agents[$host], CURLOPT_FOLLOWLOCATION, true);
      curl_setopt($agents[$host], CURLOPT_SSL_VERIFYHOST, 0);
      curl_setopt($agents[$host], CURLOPT_NOSIGNAL, true);
      curl_setopt($agents[$host], CURLOPT_RETURNTRANSFER, true);
      curl_setopt($agents[$host], CURLOPT_FAILONERROR, true);
      curl_setopt($agents[$host], CURLOPT_TIMEOUT, SINGLE_PAGE_TIMEOUT);
      curl_setopt($agents[$host], CURLOPT_CONNECTTIMEOUT, PAGE_TIMEOUT);
      //make lighttpd happier
      curl_setopt($agents[$host], CURLOPT_HTTPHEADER, array('Expect:'));
      if($post_data != NULL) {
          curl_setopt($agents[$host], CURLOPT_POST, true);
          curl_setopt($agents[$host], CURLOPT_POSTFIELDS, $post_data);
      } else {
          // since we are caching agents, need to do this so doesn't get stuck
          // as post and so query string ignored for get's
          curl_setopt($agents[$host], CURLOPT_HTTPGET, true);
      }
      crawlLog("  Set curl options for single page request");
      $time = time();
      $response = curl_exec($agents[$host]);
      if(time() - $time > PAGE_TIMEOUT) {
          crawlLog("  Request took longer than page timeout!!");
          crawlLog("  Either could not reach URL or website took too");
          crawlLog("  long to respond.");
      }
      // clear any posted data so the cached handle does not keep it
      curl_setopt($agents[$host], CURLOPT_POSTFIELDS, "");
      crawlLog("  Done curl exec");
      return $response;
  }
  537. }
  538. ?>

Documentation generated by phpDocumentor 1.4.3