diff --git a/configs/config.php b/configs/config.php
index b2e841e0f..9e5e3e35a 100644
--- a/configs/config.php
+++ b/configs/config.php
@@ -246,43 +246,73 @@ define('MAXIMUM_CRAWL_DELAY', 64);
/** maximum number of active crawl-delayed hosts */
define('MAX_WAITING_HOSTS', 250);
+/** Minimum weight in priority queue before it is rebuilt */
+define('MIN_QUEUE_WEIGHT', 1/100000);
+
+/** largest sized object allowed in a web archive (used to sanity check
+ reading data out of a web archive)
+*/
+define('MAX_ARCHIVE_OBJECT_SIZE', 100000000);
+/**
+ * Code to determine how much memory current machine has
+ */
+if(strstr(PHP_OS, "WIN")) {
+ exec('wmic memorychip get capacity', $memory_array);
+ $memory = array_sum($memory_array);
+} else {
+    exec('free -b', $memory_array);
+    $memory = isset($memory_array[1]) ? intval(preg_replace('/\D+/', ' ', $memory_array[1])) : 0;
+}
+/**
+ * Factor to multiply the sizes of Yioop data structures by on a low-RAM
+ * (under 2GB) machine
+ */
+define('MEMORY_LOW', 1);
+/**
+ * Factor to multiply the sizes of Yioop data structures by on a machine
+ * with more than 2GB of RAM
+ */
+define('MEMORY_STANDARD', 4);
+if($memory < 2000000000) {
+ /**
+     * Based on system memory, either the low or standard memory factor
+ */
+ define('MEMORY_PROFILE', MEMORY_LOW);
+} else {
+ /**
+ * @ignore
+ */
+    define('MEMORY_PROFILE', MEMORY_STANDARD);
+}
/**
* bloom filters are used to keep track of which urls are visited,
* this parameter determines up to how many
* urls will be stored in a single filter. Additional filters are
* read to and from disk.
*/
-define('URL_FILTER_SIZE', 20000000);
+define('URL_FILTER_SIZE', MEMORY_PROFILE * 5000000);
/**
* maximum number of urls that will be held in ram
* (as opposed to in files) in the priority queue
*/
-define('NUM_URLS_QUEUE_RAM', 300000);
-
-/** Minimum weight in priority queue before rebuilt */
-define('MIN_QUEUE_WEIGHT', 1/100000);
-
-/** largest sized object allowedin a web archive (used to sanity check
- reading data out of a web archive)
-*/
-define('MAX_ARCHIVE_OBJECT_SIZE', 100000000);
+define('NUM_URLS_QUEUE_RAM', MEMORY_PROFILE * 80000);
/** number of documents before next gen */
-define('NUM_DOCS_PER_GENERATION', 50000);
+define('NUM_DOCS_PER_GENERATION', MEMORY_PROFILE * 10000);
/** precision to round floating points document scores */
define('PRECISION', 10);
/** maximum number of links to extract from a page on an initial pass*/
-define('MAX_LINKS_TO_EXTRACT', 300);
+define('MAX_LINKS_TO_EXTRACT', MEMORY_PROFILE * 80);
/** maximum number of links to keep after initial extraction*/
define('MAX_LINKS_PER_PAGE', 50);
/** maximum number of links to consider from a sitemap page */
-define('MAX_LINKS_PER_SITEMAP', 300);
+define('MAX_LINKS_PER_SITEMAP', MEMORY_PROFILE * 80);
/** maximum number of words from links to consider on any given page */
define('MAX_LINKS_WORD_TEXT', 100);
@@ -317,14 +347,6 @@ define('NUM_MULTI_CURL_PAGES', 100);
/** number of pages to extract from an archive in one go */
define('ARCHIVE_BATCH_SIZE', 100);
-/**
- * Time in seconds to wait to acquire an exclusive lock before we're no longer
- * allowed to extract the next batch of pages for an archive crawl. This is
- * intended to prevent a fetcher from waiting to acquire the lock, then
- * getting it just before cURL gives up and times out the request.
- */
-define('ARCHIVE_LOCK_TIMEOUT', 8);
-
/** time in seconds before we give up on multi page requests*/
define('PAGE_TIMEOUT', 30);
@@ -459,7 +481,9 @@ define ('EN_RATIO', 0.9);
/** Number of total description deemed title */
define ('AD_HOC_TITLE_LENGTH', 10);
-/** Used to say number of bytes in histogram bar for file download sizes*/
+/** Used to say number of bytes in histogram bar (stats page) for file
+ download sizes
+ */
define('DOWNLOAD_SIZE_INTERVAL', 5000);
/** Used to say number of secs in histogram bar for file download times*/
@@ -469,10 +493,10 @@ define('DOWNLOAD_TIME_INTERVAL', 0.5);
* How many non robot urls the fetcher successfully downloads before
* between times data sent back to queue server
*/
-define ('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', 500);
+define ('SEEN_URLS_BEFORE_UPDATE_SCHEDULER', MEMORY_PROFILE * 100);
/** maximum number of urls to schedule to a given fetcher in one go */
-define ('MAX_FETCH_SIZE', 5000);
+define ('MAX_FETCH_SIZE', MEMORY_PROFILE * 1000);
/** fetcher must wait at least this long between multi-curl requests */
define ('MINIMUM_FETCH_LOOP_TIME', 5);
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index fe2e68589..99d9f91ec 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -428,8 +428,8 @@ class FetchUrl implements CrawlConstants
$end_head = stripos($site[$value], "</head");
if($end_head) {
$reg = "/charset(\s*)=(\s*)(\'|\")?((\w|\-)+)(\'|\")?/u";
- preg_match($reg, $site[$value], $match);
- if(isset($match[0])) {
+ $is_match = preg_match($reg, $site[$value], $match);
+ if($is_match && isset($match[6])) {
$len_c = strlen($match[0]);
if(($match[6] == "'" || $match[6] == '"') &&
$match[3] != $match[6]) {