seek_quarry
[ class tree: seek_quarry ] [ index: seek_quarry ] [ all elements ]

Procedural File: config.php

Source Location: /configs/config.php



Page Details:

SeekQuarry/Yioop -- Open Source Pure PHP Search Engine, Crawler, and Indexer

Copyright (C) 2009 - 2013 Chris Pollett chris@pollett.org

LICENSE:

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.

END LICENSE

Used to set the configuration settings of the SeekQuarry project.




Tags:

author:  Chris Pollett chris@pollett.org
copyright:  2009 - 2013
link:  http://www.seekquarry.com/
filesource:  Source Code for this file
license:  GPL3


Includes:

require_once(BASE_DIR."/configs/local_config.php") [line 55]
Include any locally specified defines (could use as an alternative

require_once(WORK_DIRECTORY.PROFILE_FILE_NAME) [line 102]





AD_HOC_TITLE_LENGTH [line 460]

AD_HOC_TITLE_LENGTH = 10
Number of total description deemed title


[ Top ]



API_ACCESS [line 135]

API_ACCESS = true

[ Top ]



APP_DIR [line 90]

APP_DIR = WORK_DIRECTORY."/app"

[ Top ]



ARCHIVE_BATCH_SIZE [line 319]

ARCHIVE_BATCH_SIZE = 100
number of pages to extract from an archive in one go


[ Top ]



ARCHIVE_LOCK_TIMEOUT [line 327]

ARCHIVE_LOCK_TIMEOUT = 8
Time in seconds to wait to acquire an exclusive lock before we're no longer allowed to extract the next batch of pages for an archive crawl. This is intended to prevent a fetcher from waiting to acquire the lock, then getting it just before cURL gives up and times out the request.


[ Top ]



AUTH_KEY [line 149]

AUTH_KEY = 0

[ Top ]



CACHE_LINK [line 152]

CACHE_LINK = true

[ Top ]



CACHE_ROBOT_TXT_TIME [line 238]

CACHE_ROBOT_TXT_TIME = 86400
how long in seconds to keep a cache of a robots.txt

file before re-requesting it



[ Top ]



CRAWL_DIR [line 104]

CRAWL_DIR = WORK_DIRECTORY

[ Top ]



CRAWL_TIME_OUT [line 497]

CRAWL_TIME_OUT = 1800
Number of seconds of no fetcher contact before crawl is deemed dead


[ Top ]



CSRF_TOKEN [line 99]

CSRF_TOKEN = "YIOOP_TOKEN"

[ Top ]



DBMS [line 130]

DBMS = 'sqlite3'

[ Top ]



DB_HOST [line 113]

DB_HOST = DB_URL

[ Top ]



DB_NAME [line 136]

DB_NAME = "default"

[ Top ]



DB_PASSWORD [line 138]

DB_PASSWORD = ''

[ Top ]



DB_USER [line 137]

DB_USER = ''

[ Top ]



DEBUG_LEVEL [line 131]

DEBUG_LEVEL = NO_DEBUG_INFO

[ Top ]



DEFAULT_LOCALE [line 148]

DEFAULT_LOCALE = "en-US"

[ Top ]



DESCRIPTION_WEIGHT [line 161]

DESCRIPTION_WEIGHT = 1
BM25F weight for other text within doc


[ Top ]



DISPLAY_TESTS [line 183]

DISPLAY_TESTS = ((DEBUG_LEVEL&TEST_INFO)==TEST_INFO)
if true tests are displayable


[ Top ]



DOWNLOAD_ERROR_THRESHOLD [line 339]

DOWNLOAD_ERROR_THRESHOLD = 10
Number of error pages (HTTP status 400 or greater) seen from a host before crawl-delay

host and dump remainder from current schedule



[ Top ]



DOWNLOAD_SIZE_INTERVAL [line 463]

DOWNLOAD_SIZE_INTERVAL = 5000
Used to say number of bytes in histogram bar for file download sizes


[ Top ]



DOWNLOAD_TIME_INTERVAL [line 466]

DOWNLOAD_TIME_INTERVAL = 0.5
Used to say number of secs in histogram bar for file download times


[ Top ]



EN_RATIO [line 457]

EN_RATIO = 0.9
Percentage of ASCII text before we guess we are dealing with English


[ Top ]



ERROR_CRAWL_DELAY [line 342]

ERROR_CRAWL_DELAY = 20
Crawl-delay to set in the event that DOWNLOAD_ERROR_THRESHOLD exceeded


[ Top ]



ERROR_INFO [line 47]

ERROR_INFO = 4
bit of DEBUG_LEVEL used to indicate php messages should be displayed


[ Top ]



FALLBACK_LOCALE_DIR [line 93]

FALLBACK_LOCALE_DIR = BASE_DIR."/locale"

[ Top ]



FETCH_SLEEP_TIME [line 481]

FETCH_SLEEP_TIME = 15
an idling fetcher sleeps this long between queue_server pings


[ Top ]



FIX_NAME_SERVER [line 119]

FIX_NAME_SERVER = true

[ Top ]



FORCE_SAVE_TIME [line 494]

FORCE_SAVE_TIME = 3600
Max time before dirty index (queue_server) and


[ Top ]



IN_LINK [line 154]

IN_LINK = true

[ Top ]



IP_LINK [line 155]

IP_LINK = true

[ Top ]



LINK_WEIGHT [line 163]

LINK_WEIGHT = 2
BM25F weight for other text within links to a doc


[ Top ]



LOCALE_DIR [line 106]

LOCALE_DIR = WORK_DIRECTORY."/locale"

[ Top ]



LOG_DIR [line 111]

LOG_DIR = WORK_DIRECTORY."/log"

[ Top ]



MAINTENANCE_MODE [line 50]

MAINTENANCE_MODE = false
Maintenance mode restricts access to local machine


[ Top ]



MAXIMUM_CRAWL_DELAY [line 245]

MAXIMUM_CRAWL_DELAY = 64
if the robots.txt has a Crawl-delay larger than this value don't crawl the site.

maximum value for this is 255



[ Top ]



MAX_ARCHIVE_OBJECT_SIZE [line 271]

MAX_ARCHIVE_OBJECT_SIZE = 100000000
largest sized object allowed in a web archive (used to sanity check


[ Top ]



MAX_FETCH_SIZE [line 475]

MAX_FETCH_SIZE = 5000
maximum number of urls to schedule to a given fetcher in one go


[ Top ]



MAX_LINKS_PER_PAGE [line 283]

MAX_LINKS_PER_PAGE = 50
maximum number of links to keep after initial extraction


[ Top ]



MAX_LINKS_PER_SITEMAP [line 286]

MAX_LINKS_PER_SITEMAP = 300
maximum number of links to consider from a sitemap page


[ Top ]



MAX_LINKS_TO_EXTRACT [line 280]

MAX_LINKS_TO_EXTRACT = 300
maximum number of links to extract from a page on an initial pass


[ Top ]



MAX_LINKS_WORD_TEXT [line 289]

MAX_LINKS_WORD_TEXT = 100
maximum number of words from links to consider on any given page


[ Top ]



MAX_LOG_FILE_SIZE [line 229]

MAX_LOG_FILE_SIZE = 5000000
maximum size of a log file before it is rotated


[ Top ]



MAX_PHRASE_LEN [line 313]

MAX_PHRASE_LEN = 2
maximum length +1 exact phrase matches


[ Top ]



MAX_QUERY_TERMS [line 500]

MAX_QUERY_TERMS = 10
maximum number of terms allowed in a conjunctive search query


[ Top ]



MAX_URL_LENGTH [line 297]

MAX_URL_LENGTH = 512
maximum length of urls to try to queue, this is important for

memory when creating schedule, since the amount of memory is going to be greater than the product MAX_URL_LENGTH*MAX_FETCH_SIZE text_processors need to promise to implement this check or rely on the base class which does implement it in extractHttpHttpsUrls



[ Top ]



MAX_WAITING_HOSTS [line 248]

MAX_WAITING_HOSTS = 250
maximum number of active crawl-delayed hosts


[ Top ]



MINIMUM_FETCH_LOOP_TIME [line 478]

MINIMUM_FETCH_LOOP_TIME = 5
fetcher must wait at least this long between multi-curl requests


[ Top ]



MIN_QUEUE_WEIGHT [line 266]

MIN_QUEUE_WEIGHT = 1/100000
Minimum weight in priority queue before rebuilt


[ Top ]



MIN_RESULTS_TO_GROUP [line 167]

MIN_RESULTS_TO_GROUP = 200
If that many exist, the minimum number of results to get


[ Top ]



MIRROR_NOTIFY_FREQUENCY [line 490]

MIRROR_NOTIFY_FREQUENCY = 60
How often mirror script tries to notify machine it is mirroring that it


[ Top ]



MIRROR_SYNC_FREQUENCY [line 487]

MIRROR_SYNC_FREQUENCY = 3600
How often mirror script tries to synchronize with machine it is mirroring


[ Top ]



MOBILE [line 194]

MOBILE = true

[ Top ]



NAME_SERVER [line 116]

NAME_SERVER = QUEUE_SERVER

[ Top ]



NEWS_MODE [line 157]

NEWS_MODE = 'news_off'

[ Top ]



NORMALIZE_FREQUENCY [line 346]

NORMALIZE_FREQUENCY = 10000
how often in OPIC we should normalize so that the sum of weights totals MAX_URLS


[ Top ]



NO_DEBUG_INFO [line 41]

NO_DEBUG_INFO = 0
Don't display any query info


[ Top ]



NUMBER_OF_LOG_FILES [line 232]

NUMBER_OF_LOG_FILES = 5
number of log files to rotate amongst


[ Top ]



NUM_DOCS_PER_GENERATION [line 274]

NUM_DOCS_PER_GENERATION = 50000
number of documents before next gen


[ Top ]



NUM_MULTI_CURL_PAGES [line 316]

NUM_MULTI_CURL_PAGES = 100
number of multi curl page requests in one go


[ Top ]



NUM_RECENT_URLS_TO_DISPLAY [line 506]

NUM_RECENT_URLS_TO_DISPLAY = 10
Number of recently crawled urls to display on admin screen


[ Top ]



NUM_RESULTS_PER_PAGE [line 503]

NUM_RESULTS_PER_PAGE = 10
default number of search results to display per page


[ Top ]



NUM_URLS_QUEUE_RAM [line 263]

NUM_URLS_QUEUE_RAM = 300000
maximum number of urls that will be held in ram

(as opposed to in files) in the priority queue



[ Top ]



PAGE_RANGE_REQUEST [line 302]

PAGE_RANGE_REQUEST = 50000
request this many bytes out of a page -- this is the default value to


[ Top ]



PAGE_RECRAWL_FREQUENCY [line 310]

PAGE_RECRAWL_FREQUENCY = -1

[ Top ]



PAGE_TIMEOUT [line 330]

PAGE_TIMEOUT = 30
time in seconds before we give up on multi page requests


[ Top ]



PRECISION [line 277]

PRECISION = 10
precision to round floating points document scores


[ Top ]



PREP_DIR [line 91]

PREP_DIR = WORK_DIRECTORY."/prepare"

[ Top ]



PROFILE [line 103]

PROFILE = true

[ Top ]



PROFILE_FILE_NAME [line 72]

PROFILE_FILE_NAME = "/profile.php"
setting profile.php to something else in local_config.php allows one to have


[ Top ]



PUNCT [line 453]

PUNCT = "\.|\,|\:|\;|\"|\'|\[|\/|\%|"."\]|\{|\}|\(|\)|\!|\||\&|\`|\’|\‘|©|®|™|℠|… |- |\/ |\>|,|\="
Characters we view as not part of words, not same as POSIX [:punct:]


[ Top ]



QUERY_INFO [line 45]

QUERY_INFO = 2
bit of DEBUG_LEVEL used to indicate query statistics should be displayed


[ Top ]



QUERY_STATISTICS [line 186]

QUERY_STATISTICS = ((DEBUG_LEVEL&QUERY_INFO)==QUERY_INFO)
if true query statistics are displayed


[ Top ]



QUEUE_SLEEP_TIME [line 484]

QUEUE_SLEEP_TIME = 5
minimum loop idle time of a queue_server


[ Top ]



RSS_ACCESS [line 134]

RSS_ACCESS = true

[ Top ]



SEEN_URLS_BEFORE_UPDATE_SCHEDULER [line 472]

SEEN_URLS_BEFORE_UPDATE_SCHEDULER = 500
How many non robot urls the fetcher successfully downloads before

between times data sent back to queue server



[ Top ]



SERVER_ALPHA [line 171]

SERVER_ALPHA = 1.6
For a given number of search results total to return (total_num)


[ Top ]



SESSION_NAME [line 98]

SESSION_NAME = "yioopbiscuit"
name of the cookie used to manage the session

(store language and perpage settings), define CSRF token



[ Top ]



SIGNIN_LINK [line 156]

SIGNIN_LINK = true

[ Top ]



SIMILAR_LINK [line 153]

SIMILAR_LINK = true

[ Top ]



SINGLE_PAGE_TIMEOUT [line 333]

SINGLE_PAGE_TIMEOUT = 60
time in seconds before we give up on a single page request


[ Top ]



TEST_INFO [line 43]

TEST_INFO = 1
bit of DEBUG_LEVEL used to indicate test cases should be displayable


[ Top ]



TITLE_WEIGHT [line 159]

TITLE_WEIGHT = 4
BM25F weight for title text


[ Top ]



URL_FILTER_SIZE [line 257]

URL_FILTER_SIZE = 20000000
bloom filters are used to keep track of which urls are visited, this parameter determines up to how many urls will be stored in a single filter. Additional filters are read to and from disk.


[ Top ]



USER_AGENT [line 210]

USER_AGENT = 'Mozilla/5.0 (compatible; '.USER_AGENT_SHORT.'; +'.NAME_SERVER.'bot.php)'
this is the User-Agent name the crawler provides

a web-server it is crawling



[ Top ]



USER_AGENT_SHORT [line 147]

USER_AGENT_SHORT = "NeedsNameBot"

[ Top ]



USE_FILECACHE [line 132]

USE_FILECACHE = false

[ Top ]



USE_MEMCACHE [line 150]

USE_MEMCACHE = false

[ Top ]



WEB_ACCESS [line 133]

WEB_ACCESS = true

[ Top ]



WORD_SUGGEST [line 151]

WORD_SUGGEST = true

[ Top ]



WORK_DIRECTORY [line 84]

WORK_DIRECTORY = ''

[ Top ]




Documentation generated by phpDocumentor 1.4.3