diff --git a/configs/ngram_builder.php b/configs/ngram_builder.php
deleted file mode 100644
index 3c2ef7b52..000000000
--- a/configs/ngram_builder.php
+++ /dev/null
@@ -1,172 +0,0 @@
-<?php
-/**
- * SeekQuarry/Yioop --
- * Open Source Pure PHP Search Engine, Crawler, and Indexer
- *
- * Copyright (C) 2009 - 2012 Chris Pollett chris@pollett.org
- *
- * LICENSE:
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * END LICENSE
- *
- * n grams are sequence of n words which always occur together in the same
- * sequence in a user query, ex: "honda accord". Yioop! can treat these
- * sequences of words as a single word to increase the speed and efficiency
- * of retrieval. This script can be used to create a n word grams filter
- * file for the Yioop! search engine to detect such words in documents
- * and queries. The input to this script is an xml or xml.bz2
- * Wikipedia dump. Wikipedia dumps are available for download online
- * free of cost. The n word grams filter file is specific to a language,
- * therefore, the user has to create a separate filter file for each language
- * that is to use this functionality. This script can be run multiple times to
- * create different filter files by specifying a different input xml files,
- * different values for n, and a different language as command line arguments.
- * Xml dumps of Wikipedia for different specific languages are available to
- * download, and it is these language specific dumps which serve as input to
- * this script.
- *
- * To illustrate the use ngram_build.php, here are the steps to use it
- * in the case of wanting to create an English language bigram filter file.
- *
- * Step 1: Go to http://dumps.wikimedia.org/enwiki/ and obtain a
- * dump of the English Wikipedia. This page lists all the dumps according
- * to date they were taken. Choose any suitable date or the latest. A
- * link with a label such as 20120104/, represents a dump taken on
- * 01/04/2012. Click this link to go in turn to a
- * page which has many links based on type of content you are looking for.
- * We are interested in content titled
- * "Recombine all pages, current versions only". Beneath this we might find a
- * link with a name like:
- * "enwiki-20120104-pages-meta-current.xml.bz2"
- * This is a bz2 compressed xml file containing all the English pages of
- * Wikipedia. Download the file to the "search_filters" folder of your
- * yioop work directory associated with your profile.
- * (Note: You should have sufficient hard disk space in the order of
- * 100GB to store the compressed dump and script extracted xml.
- * The script also accepts an uncompressed XML file as input.
- * The filter file generated is a few megabytes.)
- *
- * Step 2: Run this script from the php command line as follows
- * php bigram_builder enwiki-20120104-pages-meta-current.xml.bz2 en 2 1
- *
- * This would extract bigrams (the 2) from xml title's (the 1) in the dump
- * This creates a bigram filter en_2_grams.ftr for English in the same
- * directory. Yioop! will automatically detect the filter file and use
- * it the next time you crawl as well as when anyone performs an English
- * language query.
- *
- *
- * @author Ravi Dhillon ravi.dhillon@yahoo.com, Chris Pollett (modified for n
- * ngrams)
- * @package seek_quarry
- * @license http://www.gnu.org/licenses/ GPL3
- * @link http://www.seekquarry.com/
- * @copyright 2009 - 2012
- * @filesource
- */
-
-if(php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
-
-ini_set("memory_limit","1024M");
-/**
- * Calculate base directory of script
- * @ignore
- */
-define("BASE_DIR", substr(
-    dirname(realpath($_SERVER['PHP_SELF'])), 0,
-    -strlen("/configs")));
-
-/** Load in global configuration settings */
-require_once BASE_DIR.'/configs/config.php';
-
-/**
- * n word grams contains generateNWordGramsTextFile and
- * and createNWordGramsFilterFile used to create the bloom filter
- */
-require_once BASE_DIR."/lib/nword_grams.php";
-
-$num_args = count($argv);
-if( $num_args < 2 || $num_args > 7) {
-    echo "\n".
-        "ngram_builder is used to create a 'n' word gram filter file for\n".
-        "the Yioop! search engine. This filter file is used to detect when n\n".
-        "words in a language should be treated as a unit. For example, \n".
-        "Bill Clinton is 2 word gram. ngram_builder is run from the command\n".
-        "line as:\n\n".
-        "php ngram_builder.php wiki_xml lang locale ".
-        "n extract_type max_to_extract\n\n".
-        "where wiki_xml is a wikipedia xml file or a bz2 compressed xml\n".
-        "file whose urls will be used to determine the n-grams, lang\n".
-        "is an Wikipedia language tag, locale is the IANA language tag\n".
-        "of locale to store the results for (if different from lang, for\n".
-        " example, en-US versus en for lang), n is the number of words in a\n".
-        "row to consider, extract_type is where from Wikipedia source to \n".
-        "extract:\n\n".
-        " 0 = title's,\n 1 = redirect's,\n 2 = page count dump wikipedia ".
-        "data,\n 3 = page count dump wiktionary data.\n\n";
-    exit();
-}
-if(!isset($argv[2])) {
-    $argv[2] = "en";
-    $argv[3] = "en-US";
-}
-if(!isset($argv[3])) {
-    $argv[3] = $argv[2];
-}
-if(!isset($argv[4])) {
-    $argv[4] = 2; // bigrams
-}
-if(!isset($argv[5])) {
-    $argv[5] = NWordGrams::PAGE_COUNT_WIKIPEDIA;
-}
-if(!isset($argv[6]) && $argv[3] == "all" &&
-    $argv[3] == NWordGrams::PAGE_COUNT_WIKIPEDIA) {
-    $argv[6] = 400000;
-} else {
-    $argv[6] = -1;
-}
-
-if(!PROFILE) {
-    echo "Please configure the search engine instance ".
-        "by visiting its web interface on localhost.\n";
-    exit();
-}
-
-
-$wiki_file_path = PREP_DIR."/";
-if (!file_exists($wiki_file_path.$argv[1])) {
-    echo $argv[1]." does not exist in $wiki_file_path";
-    exit();
-}
-
-/*
- *This call creates a ngrams text file from input xml file and
- *returns the count of ngrams in the text file.
- */
-list($num_ngrams, $max_gram_len) =
-    NWordGrams::generateNWordGramsTextFile($argv[1], $argv[2], $argv[3],
-    $argv[4], $argv[5], $argv[6]);
-
-/*
- *This call creates a bloom filter file from n word grams text file based
- *on the language specified.The lang passed as parameter is prefixed
- *to the filter file name. The count of n word grams in text file is passed
- *as a parameter to set the limit of n word grams in the filter file.
- */
-NWordGrams::createNWordGramsFilterFile($argv[3], $argv[4], $num_ngrams,
-    $max_gram_len);
-
-?>
diff --git a/controllers/resource_controller.php b/controllers/resource_controller.php
index 245658174..ac5fc1e0d 100644
--- a/controllers/resource_controller.php
+++ b/controllers/resource_controller.php
@@ -143,7 +143,8 @@ class ResourceController extends Controller implements CrawlConstants
         if($count != 1) {return;}
         $path = LOCALE_DIR."/$locale/resources/suggest_trie.txt.gz";
         if(file_exists($path)) {
-            header("Content-Encoding: x-gzip");
+            header("Content-Type: plain/text");
+            header("Content-Encoding: gzip");
             readfile($path);
         }
     }
diff --git a/css/search.css b/css/search.css
index 59bc4d047..b19727540 100755
--- a/css/search.css
+++ b/css/search.css
@@ -550,7 +550,7 @@ p
 .setting-footer, .signin-exit, .landing-footer
 {
-    margin-top: 1in;
+    margin-top: 0.25in;
     text-align: center;
 }
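
Note on the resource_controller.php hunk above: the pattern there is to stream a file that is already gzip-compressed on disk (suggest_trie.txt.gz) and advertise that via a Content-Encoding header so the browser decompresses it transparently, rather than recompressing the trie on every request. Below is a minimal standalone sketch of that pattern, not Yioop code; the file path is hypothetical and text/plain is assumed here as a generic type for the decoded payload.

<?php
// Sketch: serve a file that is already gzip-compressed on disk.
$path = "/path/to/resources/suggest_trie.txt.gz"; // hypothetical path
if (file_exists($path)) {
    header("Content-Type: text/plain");           // type of the decoded payload (assumption)
    header("Content-Encoding: gzip");             // tells the client the bytes are gzip data
    header("Content-Length: " . filesize($path)); // length of the compressed file
    readfile($path);                              // stream the file as-is, no recompression
} else {
    header("HTTP/1.1 404 Not Found");
}
?>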