; Last commit for src/configs/default_crawl.ini: 2addb500315b7393a90fe66431d7832b1e7386c7

; Adjust copyrights years

; Chris Pollett [2024-01-03 21:Jan:rd]
; Adjust copyrights years
; ***** BEGIN LICENSE BLOCK *****
;  SeekQuarry/Yioop Open Source Pure PHP Search Engine, Crawler, and Indexer
;  Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
;
;  This program is free software: you can redistribute it and/or modify
;  it under the terms of the GNU General Public License as published by
;  the Free Software Foundation, either version 3 of the License, or
;  (at your option) any later version.
;
;  This program is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;  GNU General Public License for more details.
;
;  You should have received a copy of the GNU General Public License
;  along with this program.  If not, see <https://www.gnu.org/licenses/>.
;  ***** END LICENSE BLOCK *****
;
; crawl.ini
;
; Crawl configuration file
;
; General crawl settings. Every entry ends with ';' -- in PHP-style INI
; syntax that starts an (empty) trailing comment, so presumably it is not
; part of the value.
; NOTE(review): the meaning of the coded values below is defined by the
; application that reads this file, not by this file -- confirm before
; changing any of them.
[general]
; Two-letter code -- TODO confirm the set of valid crawl-order codes.
crawl_order = 'ad';
; NOTE(review): '-1' is used by several settings in this section
; (max_depth, repeat_type, sleep_duration, page_recrawl_frequency);
; presumably it means "no limit" / "disabled" -- confirm.
max_depth = '-1';
repeat_type = '-1';
; Looks like an HH:MM time of day -- confirm format and time zone.
sleep_start = '00:00';
sleep_duration = '-1';
; Quoted '1' here, while the last two settings of this section use bare
; true/false -- presumably all are on/off flags; confirm.
robot_txt = '1';
channel = '0';
; Two-letter code -- TODO confirm the set of valid summarizer codes.
summarizer_option = 'dk';
; Two-letter code -- TODO confirm the set of valid crawl-type codes.
crawl_type = 'ax';
; Ten-digit number; looks like a Unix timestamp (late December 2016) used
; as an identifier -- confirm.
crawl_index = '1483056689';
; Empty in this default file. These two use double quotes, unlike the
; single-quoted values elsewhere in this section.
arc_dir = "";
arc_type = "";
page_recrawl_frequency = '-1';
; Presumably a size in bytes -- confirm units.
page_range_request = '100000';
; Presumably a length in characters or bytes -- confirm units.
max_description_len = '10000';
max_links_to_extract = '50';
; Unquoted boolean keywords (not quoted strings like the values above).
cache_pages = true;
restrict_sites_by_url = false;

; List of file extensions. Each 'extensions[] = ...' line adds one more
; element to the 'extensions' array (PHP-style INI array syntax).
; NOTE(review): presumably only documents with one of these extensions
; are indexed by the crawler -- confirm against the consuming code.
[indexed_file_types]
; 'unknown' is not a real extension; presumably it stands for URLs whose
; type cannot be determined from the extension -- confirm.
extensions[] = 'unknown';
extensions[] = 'bmp';
extensions[] = 'gz';
extensions[] = 'bz';
extensions[] = 'csv';
extensions[] = 'tab';
extensions[] = 'tsv';
extensions[] = 'txt';
extensions[] = 'doc';
extensions[] = 'docx';
extensions[] = 'epub';
extensions[] = 'gif';
extensions[] = 'xml';
extensions[] = 'asp';
extensions[] = 'aspx';
extensions[] = 'cgi';
extensions[] = 'cfm';
extensions[] = 'cfml';
extensions[] = 'do';
extensions[] = 'htm';
extensions[] = 'html';
extensions[] = 'ico';
extensions[] = 'jsp';
extensions[] = 'php';
extensions[] = 'pl';
extensions[] = 'py';
extensions[] = 'shtml';
extensions[] = 'java';
extensions[] = 'jpg';
extensions[] = 'jpeg';
extensions[] = 'pdf';
extensions[] = 'png';
extensions[] = 'ppt';
extensions[] = 'pptx';
extensions[] = 'rss';
extensions[] = 'rtf';
extensions[] = 'svg';
extensions[] = 'xlsx';

; Section header only: this default configuration lists no classifiers.
[active_classifiers]

; Section header only: this default configuration lists no rankers.
[active_rankers]

; List of site URLs; each 'url[] = ...' line adds one element to the
; 'url' array.
; NOTE(review): restrict_sites_by_url is false in [general], so
; presumably this list is not enforced with the default settings --
; confirm against the consuming code.
[allowed_sites]
url[] = 'http://www.yahoo.com/';
url[] = 'http://www.youtube.com/';
url[] = 'http://www.google.com/';

; List of sites to exclude; each 'url[] = ...' line adds one element to
; the 'url' array. Entries here use a 'domain:' prefix instead of a full
; URL.
; NOTE(review): presumably 'domain:' matches every URL on that domain --
; confirm the exact matching rules.
[disallowed_sites]
url[] = 'domain:arxiv.org';
url[] = 'domain:ask.com';
url[] = 'domain:yelp.com';
url[] = 'domain:clixsense.com';

; List of seed URLs; each 'url[] = ...' line adds one element to the
; 'url' array. Presumably these are the starting points for a new crawl
; -- confirm against the consuming code.
; Most entries use https://, a few still use http://; the scheme is part
; of each value, so keep it exactly as intended when editing.
[seed_sites]
url[] = 'https://www.wikipedia.org/';
url[] = 'https://www.yahoo.com/';
url[] = 'https://www.google.com/';
url[] = 'https://www.amazon.com/';
url[] = 'https://www.bing.com/';
url[] = 'https://www.reddit.com/';
url[] = 'https://www.twitter.com/';
url[] = 'https://imgur.com/';
url[] = 'https://www.youtube.com/';
url[] = 'https://www.ebay.com/';
url[] = 'https://www.twitch.tv/';
url[] = 'https://www.pinterest.com/';
url[] = 'https://www.office.com/';
url[] = 'http://www.wikia.com/';
url[] = 'https://www.blogger.com/';
url[] = 'https://www.craigslist.org/';
url[] = 'https://www.cnn.com/';
url[] = 'https://www.cnet.com/';
url[] = 'https://www.adobe.com/';
url[] = 'https://weather.com/';
url[] = 'https://digg.com/';
url[] = 'https://www.paypal.com/';
url[] = 'https://www.tumblr.com/';
url[] = 'https://www.indeed.com/';
url[] = 'https://www.imdb.com/';
url[] = 'https://archive.org/';
url[] = 'https://www.zynga.com/';
url[] = 'https://eccc.weizmann.ac.il/';
url[] = 'http://citeseerx.ist.psu.edu/';
url[] = 'https://www.zillow.com/';
url[] = 'https://www.wolframalpha.com/';
url[] = 'https://www.sourceforge.net/';
url[] = 'https://www.huffingtonpost.com/';
url[] = 'https://www.wikimedia.org/';
url[] = 'https://www.reference.com/';
url[] = 'https://www.dell.com/';
url[] = 'http://www.metacafe.com/';
url[] = 'https://www.foxnews.com/';
url[] = 'https://www.hp.com/';
url[] = 'https://www.apple.com/';
url[] = 'https://www.stumbleupon.com/';
url[] = 'https://wordpress.org/';
url[] = 'https://www.bankofamerica.com/';
url[] = 'https://www.xing.com/';
url[] = 'https://www.microsoft.com/';
url[] = 'https://www.theguardian.com/';
url[] = 'https://www.skyrock.com/';
url[] = 'http://www.dailymail.co.uk/';
url[] = 'https://www.ign.com/';
url[] = 'https://www.mozilla.org/';
url[] = 'https://vimeo.com/';
url[] = 'https://www.wsj.com/';
url[] = 'https://www.walmart.com/';
url[] = 'https://www.reuters.com/';
url[] = 'https://www.usps.com/';
url[] = 'https://www.telegraph.co.uk/';
url[] = 'http://www.babylon.com/';
url[] = 'https://www.ups.com/';
url[] = 'https://www.mapquest.com/';
url[] = 'https://www.openstreetmap.org/';
url[] = 'https://www.bestbuy.com/';
url[] = 'https://www.verizon.com/';
url[] = 'https://www.latimes.com/';
url[] = 'https://www.washingtonpost.com/';
url[] = 'https://www.att.com/';
url[] = 'https://www.w3schools.com/';
url[] = 'https://www.fox.com/';
url[] = 'https://www.ibm.com/';
url[] = 'https://www.engadget.com/';
url[] = 'https://www.usatoday.com/';
url[] = 'https://www.chase.com/';
url[] = 'https://www.wellsfargo.com/';
url[] = 'https://www.nih.gov/';
url[] = 'https://www.irs.gov/';
url[] = 'https://www.ftb.ca.gov/';
url[] = 'https://www.monster.com/';
url[] = 'https://www.thetimes.co.uk/';
url[] = 'https://www.careerbuilder.com/';
url[] = 'http://www.abcnews.go.com/';
url[] = 'https://www.tmz.com/';
url[] = 'http://www.fedex.com/';
url[] = 'https://www.snopes.com/';
url[] = 'https://www.urbandictionary.com/';
url[] = 'https://slashdot.org/';
url[] = 'https://secure.php.net/';
url[] = 'https://www.intuit.com/';
url[] = 'https://www.thesun.co.uk/';

; Section header only: this default configuration defines no page rules.
[page_rules]

; Section for indexing plugin entries; this default configuration
; defines no 'key = value' entries under it.
[indexing_plugins]
; ViewGit