Adjust copyrights years
<?php
/**
* SeekQuarry/Yioop --
* Open Source Pure PHP Search Engine, Crawler, and Indexer
*
* Copyright (C) 2009 - 2016 Chris Pollett chris@pollett.org
*
* LICENSE:
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* END LICENSE
*
* @author Charles Bocage (charles.bocage@sjsu.edu)
* @license http://www.gnu.org/licenses/ GPL3
* @link http://www.seekquarry.com/
* @copyright 2009 - 2016
* @filesource
*/
namespace seekquarry\yioop\library;
/**
* Class used by html processors to detect if a page was made by a content
* management system, and also to provide scraping mechanisms for the content
* of such a page
*
* @author Charles Bocage (charles.bocage@sjsu.edu)
*/
class ScraperManager
{
    /**
     * Method used to check a page against a supplied list of scrapers
     * for a matching signature. The scrapers are tried in order and the
     * first one whose signature matches is returned.
     *
     * @param string $page the html page to check
     * @param array $scrapers an array of scrapers to check against; each
     *      scraper is an associative array with fields SIGNATURE, ID,
     *      SCRAPE_RULES, and NAME (SIGNATURE and SCRAPE_RULES may be
     *      html-entity encoded)
     * @return array an associative array of scraper properties (SIGNATURE,
     *      ID, SCRAPE_RULES, NAME) if a matching scraper signature found;
     *      otherwise, the empty array
     */
    public static function getScraper($page, $scrapers)
    {
        foreach ($scrapers as $scraper) {
            if (!isset($scraper['SIGNATURE'])) {
                // nothing to match against, so this entry can never apply
                continue;
            }
            $signature = html_entity_decode($scraper['SIGNATURE'],
                ENT_QUOTES);
            if (!self::checkSignature($page, $signature)) {
                continue;
            }
            /* The result is only built once a match is known, so a
               non-matching scan yields a truly empty array rather than
               one holding the last signature tried.
             */
            return [
                'SIGNATURE' => $signature,
                'ID' => $scraper['ID'],
                'SCRAPE_RULES' => html_entity_decode(
                    $scraper['SCRAPE_RULES'], ENT_QUOTES),
                'NAME' => $scraper['NAME'],
            ];
        }
        return [];
    }
    /**
     * Applies scrape rules to a given page. A scrape rule consists of
     * a sequence of xpaths delimited by ###. The first xpath is used to
     * extract content from the page, the remaining xpaths are used
     * to delete content from the result. For each remaining xpath, the
     * html of its first match in the whole page is removed
     * wherever it occurs in the extracted content.
     *
     * @param string $page the html page to operate on
     * @param string $scrape_rules_string a string of xpaths with ###
     *      used as a delimiter
     * @return string the result of extracting first xpath content and
     *      deleting from it according to the remaining xpath rules; if
     *      there are no rules, or nothing is extracted or left over, the
     *      original $page is returned
     */
    public static function applyScraperRules($page, $scrape_rules_string)
    {
        /* The delimiter is plain ASCII, so a byte-wise split is enough
           and, unlike a /u regex split, cannot fail on a rules string
           that is not valid UTF-8.
         */
        $scrape_rules = [];
        foreach (explode('###', (string)$scrape_rules_string) as $rule) {
            if ($rule !== '') {
                $scrape_rules[] = $rule;
            }
        }
        if (empty($scrape_rules)) {
            return $page;
        }
        // parse the page once and reuse the dom for every rule
        $dom = self::loadDom($page);
        if ($dom === null) {
            return $page;
        }
        $extract_rule = array_shift($scrape_rules);
        $content = self::firstMatchAsHtml($dom, $extract_rule);
        if ($content === "") {
            return $page;
        }
        foreach ($scrape_rules as $removal_rule) {
            $unwanted = self::firstMatchAsHtml($dom, $removal_rule);
            if ($unwanted !== "") {
                $content = str_replace($unwanted, "", $content);
            }
        }
        return empty($content) ? $page : $content;
    }
    /**
     * If $signature begins with '/', checks to see if applying
     * the xpath in $signature to $page results in a non-empty dom node
     * list. Otherwise, $signature is treated as literal text (it is not
     * interpreted as a regular expression) and the method returns whether
     * that text occurs anywhere in $page.
     *
     * @param string $page a web document to check
     * @param string $signature an xpath, or a literal string, to check
     *      against
     * @return boolean true if the xpath returns a non-empty dom node list
     *      or the literal text is found; false otherwise, including when
     *      $page or $signature is empty or the xpath is malformed
     */
    public static function checkSignature($page, $signature)
    {
        if (!is_string($signature) || $signature === '' ||
            !is_string($page) || $page === '') {
            /* An empty signature carries no information; without this
               guard it would have matched every page.
             */
            return false;
        }
        if ($signature[0] === '/') {
            $dom = self::loadDom($page);
            if ($dom === null) {
                return false;
            }
            $nodes = self::queryDom($dom, $signature);
            return $nodes !== null && $nodes->length > 0;
        }
        /* A plain substring search is the literal match intended here and
           stays correct when the signature itself contains '/' or other
           characters that are special in a regex.
         */
        return strpos($page, $signature) !== false;
    }
    /**
     * Get the contents of a document via an xpath
     *
     * @param string $page a document to apply the xpath query against
     * @param string $query the xpath query to run
     * @return string html of the first node matched by the query;
     *      otherwise (no match, unparsable page, malformed query) an
     *      empty string
     */
    public static function getContentByXquery($page, $query)
    {
        $dom = self::loadDom($page);
        if ($dom === null) {
            return "";
        }
        return self::firstMatchAsHtml($dom, $query);
    }
    /**
     * Parses an html string into a dom document without letting markup
     * problems surface as php warnings.
     *
     * @param string $page html to parse
     * @return \DOMDocument|null the parsed document, or null if $page is
     *      not a non-empty string or could not be loaded
     */
    private static function loadDom($page)
    {
        if (!is_string($page) || $page === '') {
            // loadHTML rejects an empty source, so do not hand it one
            return null;
        }
        $previous = libxml_use_internal_errors(true);
        $dom = new \DOMDocument();
        $loaded = $dom->loadHTML($page);
        if (!$previous) {
            /* We switched buffering on ourselves, so drop what we
               collected. If the caller was already buffering libxml
               errors the buffer is left for the caller to manage.
             */
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previous);
        return $loaded ? $dom : null;
    }
    /**
     * Runs an xpath query against an already parsed document.
     *
     * @param \DOMDocument $dom document to query
     * @param string $query xpath expression to evaluate
     * @return \DOMNodeList|null the matched nodes, or null if the
     *      expression is empty or malformed
     */
    private static function queryDom($dom, $query)
    {
        if (!is_string($query) || $query === '') {
            return null;
        }
        $xpath = new \DOMXPath($dom);
        /* query() returns false and raises a warning for a malformed
           expression; scrape rules are stored data, so a bad rule is
           treated as "no match" rather than as an error.
         */
        $nodes = @$xpath->query($query);
        return ($nodes instanceof \DOMNodeList) ? $nodes : null;
    }
    /**
     * Serializes the first node matched by an xpath query as html.
     *
     * @param \DOMDocument $dom document to query
     * @param string $query xpath expression to evaluate
     * @return string html of the first matching node, or an empty string
     *      if nothing matched
     */
    private static function firstMatchAsHtml($dom, $query)
    {
        $nodes = self::queryDom($dom, $query);
        if ($nodes === null || $nodes->length === 0) {
            return "";
        }
        $html = $dom->saveHTML($nodes->item(0));
        return ($html === false) ? "" : $html;
    }
}