Last commit for src/library/CrawlConstants.php: 2addb500315b7393a90fe66431d7832b1e7386c7

Adjust copyrights years

Chris Pollett [2024-01-03 21:Jan:rd]
Adjust copyrights years
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009 - 2023  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @license https://www.gnu.org/licenses/ GPL3
 * @link https://www.seekquarry.com/
 * @copyright 2009 - 2023
 * @filesource
 */
namespace seekquarry\yioop\library;

/**
 * Constants and enum-like values shared by the components that take part
 * in the crawling process.
 *
 * The interface declares no methods; it only carries constants. Most of
 * them map a descriptive name to a short one- or two-letter string code.
 * NOTE(review): the short codes are presumably used as compact field keys
 * in crawl data, so existing values should not be changed or reused --
 * confirm against the code that reads and writes them.
 *
 * @author Chris Pollett
 */
interface CrawlConstants
{
    /**
     * Kind of queue_server: one that acts as both indexer and scheduler
     */
    const BOTH = 'IndexerAndScheduler';
    /**
     * Kind of queue_server: one that acts only as an indexer
     */
    const INDEXER = 'Indexer';
    /**
     * Kind of queue_server: one that acts only as a scheduler
     */
    const SCHEDULER = 'Scheduler';
    /**
     * Base names and file names shared by the crawl components.
     * NOTE(review): presumably combined with timestamps or directory
     * prefixes to build on-disk names -- confirm against callers.
     */
    const queue_base_name = 'QueueBundle';
    const archive_base_name = 'Archive';
    const name_archive_iterator = 'NameArchiveIterator';
    const fetch_archive_iterator = 'FetchArchiveIterator';
    const save_point = 'SavePoint';
    const schedule_data_base_name = 'ScheduleData';
    /** Name for a fetch schedule; obtained from the scheduler */
    const schedule_name = 'FetchSchedule';
    const robot_data_base_name = 'RobotData';
    const etag_expires_data_base_name = 'EtagExpiresData';
    const index_data_base_name = 'IndexData';
    const feed_index_data_base_name = 'IndexDataFeed';
    const double_index_base_name = 'DoubleIndexData';
    const network_base_name = 'Network';
    const network_crawllist_base_name = 'NetworkCrawlList';
    const statistics_base_name = 'Statistics';
    const index_closed_name = 'IndexClosed';
    /** Name for a fetch batch; used to continue if a fetcher halted */
    const fetch_batch_name = 'FetchBatch';
    const fetch_crawl_info = 'FetchInfo';
    const fetch_closed_name = 'FetchClosed';
    const data_base_name = 'At';
    const schedule_start_name = 'StartCrawlSchedule.txt';
    const robot_table_name = 'robot_table.txt';
    const mirror_table_name = 'mirror_table.txt';
    const local_ip_cache_file = 'local_ip_cache.txt';
    /**
     * Directions a word iterator can move in
     */
    const ASCENDING = 1;
    const DESCENDING = -1;
    /**
     * Timestamp used for the media feed index archive bundle
     */
    const FEED_CRAWL_TIME = 100;
    /**
     * Orientation values used in the priority queue
     */
    const MAX = 1;
    const MIN = -1;
    /**
     * States of daemon processes
     */
    const STOP_STATE = -1;
    const CONTINUE_STATE = 1;
    const NO_DATA_STATE = 2;
    const WAITING_START_MESSAGE_STATE = 3;
    const REDO_STATE = 4;
    /**
     * Short string codes, single lowercase letters ('p' is not assigned
     * in this list)
     */
    const STATUS = 'a';
    const CRAWL_TIME = 'b';
    const HTTP_CODE = 'c';
    const TIMESTAMP = 'd';
    const TYPE = 'e';
    const ENCODING = 'f';
    const SEEN_URLS = 'g';
    const MACHINE = 'h';
    const INVERTED_INDEX = 'i';
    const SAVED_CRAWL_TIMES = 'j';
    const SCHEDULE_TIME = 'k';
    const URL = 'l';
    const WEIGHT = 'm';
    const ROBOT_PATHS = 'n';
    const HASH = 'o';
    const PAGE = 'q';
    const DOC_INFO = 'r';
    const TITLE = 's';
    const DESCRIPTION = 't';
    const THUMB = 'u';
    const CRAWL_DELAY = 'v';
    const LINKS = 'w';
    const ROBOT_TXT = 'x';
    const TO_CRAWL = 'y';
    const INDEX = 'z';
    /**
     * Short string codes, single uppercase letters
     */
    const DESCRIPTION_SCORES = 'A';
    const HEIGHT = 'B';
    const WIDTH = 'C';
    const ROBOTS_TXT = 'D';
    const DEBUG = 'E';
    const DIRECTION = 'F';
    const PINNED = 'G';
    const SLEEP_START = 'H';
    const SLEEP_DURATION = 'I';
    const TOP_LEVEL_LINKS = 'J';
    const FAVICON_URL = 'K';
    const NUM_ICON_IMAGES = 'L';
    const DOC_DEPTH = 'M';
    const DOC_RANK = 'N';
    const URL_WEIGHT = 'O';
    const INLINKS = 'P';
    const NEW_CRAWL = 'Q';
    const OFFSET = 'R';
    const PATHS = 'S';
    const HASH_URL = 'T';
    const SUMMARY_OFFSET = 'U';
    const DUMMY = 'V';
    const SITES = 'W';
    const SCORE = 'X';
    const CRAWL_ORDER = 'Y';
    const RESTRICT_SITES_BY_URL = 'Z';
    /**
     * Short string codes, two letters starting with 'a'
     */
    const ALLOWED_SITES = 'aa';
    const DISALLOWED_SITES = 'ab';
    const BREADTH_FIRST = 'ac';
    const HOST_BUDGETING = 'ad';
    const MACHINE_URI = 'ae';
    const SITE_INFO = 'af';
    const FILETYPE = 'ag';
    const SUMMARY = 'ah';
    const URL_INFO = 'ai';
    const HASH_SEEN_URLS = 'aj';
    const RECENT_URLS = 'ak';
    const MEMORY_USAGE = 'al';
    const DOC_ID = 'am';
    const RELEVANCE = 'an';
    const PAGE_RULES = 'ao';
    const CACHE_PAGE_PARTITION = 'ap';
    const GENERATION = 'aq';
    const AUX_DOCS = 'ar';
    const HASH_URL_COUNT = 'as';
    const IS_DOC = 'at';
    const IP_ADDRESSES = 'au';
    const CLD_IN_COMMON = 'av';
    const JUST_METAS = 'aw';
    const WEB_CRAWL = 'ax';
    const ARCHIVE_CRAWL = 'ay';
    const CRAWL_TYPE = 'az';
    /**
     * Short string codes, two letters starting with 'b'
     */
    const CRAWL_INDEX = 'ba';
    const HEADER = 'bb';
    const SERVER = 'bc';
    const SERVER_VERSION = 'bd';
    const OPERATING_SYSTEM = 'be';
    const MODIFIED = 'bf';
    const LANG = 'bg';
    const ROBOT_INSTANCE = 'bh';
    const DOC_LEN = 'bi';
    const SUBDOCS = 'bj';
    const SUBDOCTYPE = 'bk';
    const INDEXING_PLUGINS = 'bl';
    const DOMAIN_WEIGHTS = 'bm';
    const POSITION_LIST = 'bn';
    const PROXIMITY = 'bo';
    const LOCATION = 'bp';
    const INDEXED_FILE_TYPES = 'bq';
    const PAGE_RANGE_REQUEST = 'br';
    const PAGE_RECRAWL_FREQUENCY = 'bs';
    const DATA = 'bt';
    const QUEUE_SERVERS = 'bu';
    const CURRENT_SERVER = 'bv';
    const SIZE = 'bw';
    const TOTAL_TIME = 'bx';
    const DNS_TIME = 'by';
    const AGENT_LIST = 'bz';
    /**
     * Short string codes, two letters starting with 'c'
     */
    const ROBOT_METAS = 'ca';
    const ARC_DIR = 'cb';
    const ARC_TYPE = 'cc';
    const ARC_DATA = 'cd';
    const KEY = 'ce';
    const MACHINE_ID = 'cf';
    const IS_VIDEO = 'cg';
    const IS_FEED = 'ch';
    const SOURCE_NAME = 'ci';
    const LINK_SEEN_URLS = 'cj';
    const POST_MAX_SIZE = 'ck';
    const LOGGING = 'cl';
    const META_WORDS = 'cm';
    const CACHE_PAGES = 'cn';
    const WARC_ID = 'co';
    const START_PARTITION = 'cp';
    const INI = 'cq';
    const UI_FLAGS = 'cr';
    const KEYWORD_LINKS = 'cs';
    const END_ITERATOR = 'ct';
    const ACTIVE_CLASSIFIERS = 'cu';
    const ACTIVE_CLASSIFIERS_DATA = 'cv';
    const MAX_DESCRIPTION_LEN = 'cw';
    const CACHE_PAGE_VALIDATORS = 'cx';
    const CACHE_PAGE_VALIDATION_DATA = 'cy';
    const NUM_PARTITIONS = 'cz';
    /**
     * Short string codes, two letters starting with 'd' ('dj' is not
     * assigned in this list)
     */
    const PARTITION_NUM = 'da';
    const ACTIVE_RANKERS = 'db';
    const USER_RANKS = 'dc';
    const INDEXING_PLUGINS_DATA = 'dd';
    const REPOSITORY_TYPE = 'de';
    const FILE_NAME = 'df';
    const SHA_HASH = 'dg';
    const TOR_PROXY = 'dh';
    const PROXY_SERVERS = 'di';
    /**
     * Integer flag rather than a string code: 0x7FFFFFFF (2147483647).
     * It sits where the code 'dj' would otherwise fall in the sequence.
     */
    const NEEDS_OFFSET_FLAG = 0x7FFFFFFF;
    const BASIC_SUMMARIZER = 'dk';
    const CENTROID_SUMMARIZER = 'dl';
    const SUMMARIZER_OPTION = 'dm';
    const WORD_CLOUD = 'dn';
    const MAX_LINKS_TO_EXTRACT = 'do';
    const IS_GOPHER_URL = 'dp';
    const MINIMUM_FETCH_LOOP_TIME = 'dq';
    const IMAGE_LINK = 'dr';
    const GRAPH_BASED_SUMMARIZER = 'ds';
    const CENTROID_WEIGHTED_SUMMARIZER = 'dt';
    const SCRAPER_LABEL = 'du';
    const SCRAPERS = 'dv';
    const QUESTION_ANSWERS = 'dw';
    const CONTENT_SIZE = 'dx';
    const NO_RANGE = 'dy';
    const MAX_DEPTH = 'dz';
    /**
     * Short string codes, two letters starting with 'e'
     */
    const REPEAT_TYPE = 'ea';
    const CHANNEL = 'eb';
    const THUMB_URL = 'ec';
    const IS_VR = 'ed';
    const DURATION = 'ee';
    const PUBDATE = 'ef';
    const SLOW_START = 'eg';
    const IS_SAFE = 'eh';
    const INDEX_VERSION = 'ei';
    const MORE_FILTER_TERMS = 'ej';
    const URL_PARENT = 'ek';
    const AVERAGE_COLOR = 'el';
    const IS_BLACK_AND_WHITE = 'em';
    const VIEW_COUNT = 'en';
    const PROCESSOR = 'eo';
    const FALLBACK_PROCESSOR = 'ep';
    const SCRAPER_INFO = 'eq';
}
ViewGit