; Last commit for src/configs/default_crawl.ini: 2addb500315b7393a90fe66431d7832b1e7386c7
;
; Adjust copyrights years
;
; Chris Pollett [2024-01-03 21:Jan:rd]
; Adjust copyrights years
; ***** BEGIN LICENSE BLOCK *****
;  SeekQuarry/Yioop Open Source Pure PHP Search Engine, Crawler, and Indexer
;  Copyright (C) 2009 - 2017  Chris Pollett chris@pollett.org
;
;  This program is free software: you can redistribute it and/or modify
;  it under the terms of the GNU General Public License as published by
;  the Free Software Foundation, either version 3 of the License, or
;  (at your option) any later version.
;
;  This program is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;  GNU General Public License for more details.
;
;  You should have received a copy of the GNU General Public License
;  along with this program.  If not, see <http://www.gnu.org/licenses/>.
;  ***** END LICENSE BLOCK *****
;
; default_crawl.ini
;
; This is an example of a crawl.ini configuration file. If you mess up
; the crawl.ini you can simply delete it and this one will be used to recreate
; it.
;
; General crawl options. This file is used to recreate crawl.ini, so the
; values below act as the defaults.
[general]
; Both archive settings are empty strings by default.
; NOTE(review): presumably only filled in for archive crawls; confirm.
arc_dir = "";
arc_type = "";
; NOTE(review): 'ad' and 'ax' are short option codes interpreted by the
; consuming code; their meaning is not visible in this file, so look them up
; before changing them.
crawl_order = 'ad';
crawl_type = 'ax';
; NOTE(review): presumably the maximum number of bytes requested per page;
; confirm the units.
page_range_request = '50000';
; NOTE(review): '-1' looks like a sentinel meaning pages are not recrawled;
; confirm.
page_recrawl_frequency = '-1';
; Unquoted boolean; every other value in this section is a quoted string.
restrict_sites_by_url = false;
; NOTE(review): 'dl' is another short option code; confirm its meaning.
summarizer_option = 'dl';
; NOTE(review): presumably the maximum length of a stored page description;
; confirm the units.
max_description_len = '2000';

; File extensions listed for indexing. Each extension appears exactly once;
; check for an existing entry before adding a new one.
; NOTE(review): 'unknown' presumably covers URLs whose extension is not
; recognised; confirm against the consuming code.
[indexed_file_types]
extensions[] = 'unknown';
extensions[] = 'bmp';
extensions[] = 'doc';
extensions[] = 'docx';
extensions[] = 'csv';
extensions[] = 'tab';
extensions[] = 'tsv';
extensions[] = 'txt';
extensions[] = 'epub';
extensions[] = 'asp';
extensions[] = 'aspx';
extensions[] = 'cgi';
extensions[] = 'cfm';
extensions[] = 'cfml';
extensions[] = 'do';
extensions[] = 'htm';
extensions[] = 'html';
extensions[] = 'jsp';
extensions[] = 'php';
extensions[] = 'pl';
extensions[] = 'py';
extensions[] = 'shtml';
extensions[] = 'gif';
extensions[] = 'xml';
extensions[] = 'java';
extensions[] = 'jpg';
extensions[] = 'jpeg';
extensions[] = 'pdf';
extensions[] = 'png';
extensions[] = 'ppt';
extensions[] = 'pptx';
extensions[] = 'rss';
extensions[] = 'rtf';
extensions[] = 'svg';
extensions[] = 'xlsx';

; NOTE(review): presumably the list of sites a crawl is limited to when
; restrict_sites_by_url in [general] is enabled (it is false in this file);
; confirm against the consuming code.
[allowed_sites]
url[] = 'http://www.yahoo.com/';
url[] = 'http://www.youtube.com/';
url[] = 'http://www.google.com/';

; NOTE(review): presumably sites the crawler should not fetch; confirm.
; Entries here use the 'domain:' prefix form rather than full URLs.
[disallowed_sites]
url[] = 'domain:arxiv.org';
url[] = 'domain:ask.com';
url[] = 'domain:yelp.com';
url[] = 'domain:clixsense.com';

; Seed URLs, one url[] entry per line.
; NOTE(review): presumably the starting points of a new crawl; confirm.
; Most entries end in '/'; the stumbleupon.com and nih.gov entries do not and
; are kept exactly as they were.
[seed_sites]
url[] = 'http://www.ucanbuyart.com/';
url[] = 'http://www.wikipedia.org/';
url[] = 'http://www.dmoz.org/';
url[] = 'http://www.yahoo.com/';
url[] = 'http://www.google.com/';
url[] = 'http://www.amazon.com/';
url[] = 'http://www.bing.com/';
url[] = 'http://www.facebook.com/';
url[] = 'http://www.blogger.com/';
url[] = 'http://www.myspace.com/';
url[] = 'http://www.craigslist.org/';
url[] = 'http://www.cnn.com/';
url[] = 'http://www.about.com/';
url[] = 'http://www.cnet.com/';
url[] = 'http://www.adobe.com/';
url[] = 'http://www.mozilla.com/';
url[] = 'http://www.weather.com/';
url[] = 'http://www.digg.com/';
url[] = 'http://www.zynga.com/';
url[] = 'http://www.download.com/';
url[] = 'http://www.ebay.com/';
url[] = 'http://eccc.hpi-web.de/';
url[] = 'http://citeseerx.ist.psu.edu/';
url[] = 'http://www.archive.org/';
url[] = 'http://www.imdb.com/';
url[] = 'http://www.zillow.com/';
url[] = 'http://www.wolframalpha.com/';
url[] = 'http://www.youtube.com/';
url[] = 'http://www.sourceforge.net/';
url[] = 'http://www.huffingtonpost.com/';
url[] = 'http://www.wikimedia.org/';
url[] = 'http://www.reference.com/';
url[] = 'http://www.comcast.net/';
url[] = 'http://www.dell.com/';
url[] = 'http://www.metacafe.com/';
url[] = 'http://www.foxnews.com/';
url[] = 'http://www.hp.com/';
url[] = 'http://www.stumbleupon.com';
url[] = 'http://www.twitter.com/';
url[] = 'http://www.wordpress.org/';
url[] = 'http://www.bankofamerica.com/';
url[] = 'http://www.xing.com/';
url[] = 'http://www.microsoft.com/';
url[] = 'http://www.mybrowserbar.com/';
url[] = 'http://www.guardian.co.uk/';
url[] = 'http://www.skyrock.com/';
url[] = 'http://www.dailymail.co.uk/';
url[] = 'http://www.ign.com/';
url[] = 'http://www.mozilla.org/';
url[] = 'http://www.vimeo.com/';
url[] = 'http://www.wsj.com/';
url[] = 'http://www.walmart.com/';
url[] = 'http://www.reuters.com/';
url[] = 'http://www.usps.com/';
url[] = 'http://www.telegraph.co.uk/';
url[] = 'http://www.babylon.com/';
url[] = 'http://www.ups.com/';
url[] = 'http://www.mapquest.com/';
url[] = 'http://www.reddit.com/';
url[] = 'http://www.theplanet.com/';
url[] = 'http://bestbuy.com/';
url[] = 'http://www.verizon.net/';
url[] = 'http://www.onemanga.com/';
url[] = 'http://www.latimes.com/';
url[] = 'http://www.washingtonpost.com/';
url[] = 'http://www.att.com/';
url[] = 'http://www.w3schools.com/';
url[] = 'http://www.fox.com/';
url[] = 'http://www.ibm.com/';
url[] = 'http://www.engadget.com/';
url[] = 'http://www.usatoday.com/';
url[] = 'http://www.chase.com/';
url[] = 'http://www.wellsfargo.com/';
url[] = 'http://www.nih.gov';
url[] = 'http://www.irs.gov/';
url[] = 'http://www.ftb.ca.gov/';
url[] = 'http://www.monster.com/';
url[] = 'http://www.timesonline.co.uk/';
url[] = 'http://www.careerbuilder.com/';
url[] = 'http://www.icq.com/';
url[] = 'http://www.abcnews.go.com/';
url[] = 'http://www.tmz.com/';
url[] = 'http://www.fedex.com/';
url[] = 'http://www.informer.com/';
url[] = 'http://www.snopes.com/';
url[] = 'http://www.urbandictionary.com/';
url[] = 'http://www.slashdot.org/';
url[] = 'http://www.php.net/';
url[] = 'http://www.intuit.com/';
url[] = 'http://www.thesun.co.uk/';

; Page rules section; this default file defines no rules.
[page_rules]

; Indexing plugins section; this default file lists no entries.
; NOTE(review): presumably names the indexing plugins to run during a crawl;
; confirm the expected key format before adding entries.
[indexing_plugins]