diff --git a/index.php b/index.php index c65cb9e00..fc9555073 100644 --- a/index.php +++ b/index.php @@ -43,6 +43,6 @@ function passthruYioopRequest() exit(); } define("seekquarry\\yioop\\configs\\REDIRECTS_ON", true); - require_once "src/index.php"; + require_once __DIR__."/src/index.php"; } passthruYioopRequest(); diff --git a/src/controllers/RegisterController.php b/src/controllers/RegisterController.php index afebdf2bb..118f5348e 100755 --- a/src/controllers/RegisterController.php +++ b/src/controllers/RegisterController.php @@ -34,6 +34,7 @@ use seekquarry\yioop\configs as C; use seekquarry\yioop\library as L; use seekquarry\yioop\library\CrawlConstants; use seekquarry\yioop\library\MailServer; +use seekquarry\yioop\library\UrlParser; use seekquarry\yioop\models\LocaleModel; /** diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php index 50d530f7d..4c5240fc7 100755 --- a/src/controllers/SearchController.php +++ b/src/controllers/SearchController.php @@ -1564,6 +1564,9 @@ class SearchController extends Controller implements CrawlConstants } set_error_handler(C\NS_LIB . "yioop_error_handler"); $body = $dom->getElementsByTagName('body')->item(0); + if(!$body) { + return $cache_file; + } //make tags in body absolute $body = $this->canonicalizeLinks($body, $url); $first_child = $body->firstChild; diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index 6f4b5f6fd..95c400572 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -2017,6 +2017,8 @@ class QueueServer implements CrawlConstants, Join * It is possible that a large schedule file is created if someone * pastes more than MAX_FETCH_SIZE many urls into the initial seed sites * of a crawl in the UI. + * + * @param array& $sites array containing to crawl data */ public function dumpBigScheduleToSmall(&$sites) { diff --git a/src/library/BloomFilterFile.php b/src/library/BloomFilterFile.php index 07e2bcb8a..ac9217dbd 100755 --- a/src/library/BloomFilterFile.php +++ b/src/library/BloomFilterFile.php @@ -33,7 +33,7 @@ namespace seekquarry\yioop\library; /** * For packInt/unpackInt */ -require_once "Utility.php"; +require_once __DIR__."/Utility.php"; /** * Code used to manage a bloom filter in-memory and in file. diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php index a526bfad3..860ab5938 100644 --- a/src/library/IndexArchiveBundle.php +++ b/src/library/IndexArchiveBundle.php @@ -35,7 +35,7 @@ use seekquarry\yioop\configs as C; /** * Used for crawlLog and crawlHash */ -require_once 'Utility.php'; +require_once __DIR__.'/Utility.php'; /** * Encapsulates a set of web page summaries and an inverted word-index of terms * from these summaries which allow one to search for summaries containing a diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php index 1a0d1c3cf..e8e10f33a 100644 --- a/src/library/IndexManager.php +++ b/src/library/IndexManager.php @@ -35,7 +35,7 @@ use seekquarry\yioop\configs as C; /** * For crawlHash */ -require_once "Utility.php"; +require_once __DIR__."/Utility.php"; /** * Class used to manage open IndexArchiveBundle's while performing * a query. Ensures an easy place to obtain references to these bundles diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php index c10dc5ba3..02bb4b32b 100644 --- a/src/library/IndexShard.php +++ b/src/library/IndexShard.php @@ -35,7 +35,7 @@ use seekquarry\yioop\configs as C; /** * Load charCopy */ -require_once "Utility.php"; +require_once __DIR__."/Utility.php"; /** * Data structure used to store one generation worth of the word document * index (inverted index). diff --git a/src/library/MailServer.php b/src/library/MailServer.php index 9d619cccc..3f06003ae 100644 --- a/src/library/MailServer.php +++ b/src/library/MailServer.php @@ -37,7 +37,7 @@ use seekquarry\yioop\library\MediaConstants; /** * Timing functions */ -require_once "Utility.php"; +require_once __DIR__."/Utility.php"; /** * A small class for communicating with an SMTP server. Used to avoid * configuration issues that might be needed with PHP's built-in mail() diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index 88d269c2d..e7c4b9c00 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -37,11 +37,11 @@ use seekquarry\yioop\library\processors\PageProcessor; /** * For crawlHash */ -require_once "Utility.php"; +require_once __DIR__."/Utility.php"; /** * So know which part of speech tagger to use */ -require_once "LocaleFunctions.php"; +require_once __DIR__."/LocaleFunctions.php"; /** * Library of functions used to manipulate words and phrases * diff --git a/src/library/PriorityQueue.php b/src/library/PriorityQueue.php index a6c9624f1..98921a00a 100755 --- a/src/library/PriorityQueue.php +++ b/src/library/PriorityQueue.php @@ -35,7 +35,7 @@ use seekquarry\yioop\configs as C; /** * Loaded for crawlLog function */ -require_once "Utility.php"; +require_once __DIR__."/Utility.php"; /** * * Code used to manage a memory efficient priority queue. diff --git a/src/library/StringArray.php b/src/library/StringArray.php index 01f6ec2e3..ccfd367ce 100755 --- a/src/library/StringArray.php +++ b/src/library/StringArray.php @@ -33,7 +33,7 @@ namespace seekquarry\yioop\library; /** * Load charCopy */ -require_once "Utility.php"; +require_once __DIR__."/Utility.php"; /** * Memory efficient implementation of persistent arrays * diff --git a/src/library/Utility.php b/src/library/Utility.php index eb62d0d7a..d2c137a3a 100755 --- a/src/library/Utility.php +++ b/src/library/Utility.php @@ -70,8 +70,10 @@ function yioop_error_handler($errno, $errstr, $errfile, $errline, if (isset($call['function'])) { $function .= $call['function']; } - echo " $in_or_called $function, line {$call['line']}". - " in {$call['file']} \n"; + $line = (isset($call['line'])) ? $call['line'] : ""; + $file = (isset($call['file'])) ? $call['file'] : ""; + echo " $in_or_called $function, line $line". + " in $file \n"; $in_or_called = "called from"; $i++; if ($i >= $num_lines_of_backtrace) {break; } diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php index efbad7157..5504d70c8 100755 --- a/src/library/WebArchive.php +++ b/src/library/WebArchive.php @@ -35,7 +35,7 @@ use seekquarry\yioop\configs as C; /** * Loads crawlLog functions if needed */ -require_once "Utility.php"; +require_once __DIR__."/Utility.php"; /** * * Code used to manage web archive files diff --git a/src/library/WebQueueBundle.php b/src/library/WebQueueBundle.php index 5d7501449..5e9e44fa3 100755 --- a/src/library/WebQueueBundle.php +++ b/src/library/WebQueueBundle.php @@ -36,7 +36,7 @@ use seekquarry\yioop\library\compressors\NonCompressor; /** * Used for the crawlHash function */ -require_once 'Utility.php'; +require_once __DIR__.'/Utility.php'; /** * Encapsulates the data structures needed to have a queue of to crawl urls * diff --git a/src/library/archive_bundle_iterators/WebArchiveBundleIterator.php b/src/library/archive_bundle_iterators/WebArchiveBundleIterator.php index 3c6314784..b03097ace 100644 --- a/src/library/archive_bundle_iterators/WebArchiveBundleIterator.php +++ b/src/library/archive_bundle_iterators/WebArchiveBundleIterator.php @@ -35,7 +35,7 @@ use seekquarry\yioop\library\CrawlConstants; use seekquarry\yioop\library\WebArchiveBundle; /** For crawlTimeoutLog */ -require_once BASE_DIR.'/library/Utility.php'; +require_once __DIR__.'/../Utility.php'; /** * Class used to model iterating documents indexed in * an WebArchiveBundle. This would typically be for the purpose diff --git a/src/models/Model.php b/src/models/Model.php index 4fb906790..d501c4016 100755 --- a/src/models/Model.php +++ b/src/models/Model.php @@ -213,6 +213,9 @@ class Model implements CrawlConstants * @param string $text haystack to extract snippet from * @param array $words keywords used to make look in haystack * @param string $description_length length of the description desired + * @param bool $words_change getSnippets might be called many times on + * the same search page with the same $words, if true then the + * preprocessing of $words is avoided and cached versions are used * @return string a concatenation of the extracted snippets of each word */ public function getSnippets($text, $words, $description_length,