diff --git a/bin/fetcher.php b/bin/fetcher.php index 72d732d67..a723caf39 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -1568,6 +1568,7 @@ class Fetcher implements CrawlConstants } $meta_ids[] = 'info:'.$site[self::URL]; $meta_ids[] = 'info:'.crawlHash($site[self::URL]); + $meta_ids[] = 'pages:all'; foreach($site[self::IP_ADDRESSES] as $address) { $meta_ids[] = 'ip:'.$address; diff --git a/configs/config.php b/configs/config.php index dfc1b95fc..dcd7e8ca4 100755 --- a/configs/config.php +++ b/configs/config.php @@ -325,7 +325,7 @@ define ('EN_RATIO', 0.9); define ('AD_HOC_TITLE_LENGTH', 10); /** BM25F weight for title text */ -define ('TITLE_WEIGHT', 5); +define ('TITLE_WEIGHT', 4); /** BM25F weight for other text within doc*/ define ('DESCRIPTION_WEIGHT', 1); diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php index 2e9513fbd..6b80b6a48 100644 --- a/lib/index_bundle_iterators/intersect_iterator.php +++ b/lib/index_bundle_iterators/intersect_iterator.php @@ -61,13 +61,6 @@ class IntersectIterator extends IndexBundleIterator */ var $num_iterators; - /** - * The number of documents in the current block after filtering - * by restricted words - * @var int - */ - var $count_block; - /** * The number of iterated docs before the restriction test * @var int @@ -103,7 +96,6 @@ class IntersectIterator extends IndexBundleIterator */ for($i = 0; $i < $this->num_iterators; $i++) { $this->num_docs += $this->index_bundle_iterators[$i]->num_docs; - $this->index_bundle_iterators[$i]->setResultsPerBlock(1); } $this->reset(); } @@ -150,7 +142,6 @@ class IntersectIterator extends IndexBundleIterator */ function findDocsWithWord() { - $pages = array(); $status = $this->syncGenDocOffsetsAmongstIterators(); if($status == -1) { @@ -255,7 +246,7 @@ class IntersectIterator extends IndexBundleIterator } /** - * Finds the next generation and doc offet amongst all the iterators + * Finds the next generation and doc offset 
amongst all the iterators * that contains the word. It assumes that the (generation, doc offset) * pairs are ordered in an increasing fashion for the underlying iterators */ @@ -263,7 +254,6 @@ class IntersectIterator extends IndexBundleIterator { $biggest_gen_offset = $this->index_bundle_iterators[ 0]->currentGenDocOffsetWithWord(); - $all_same = true; for($i = 0; $i < $this->num_iterators; $i++) { $cur_gen_doc_offset = @@ -287,10 +277,10 @@ class IntersectIterator extends IndexBundleIterator } $last_changed = -1; $i = 0; - while($i != $last_changed) { + while($i != $last_changed) { if($last_changed == -1) $last_changed = 0; if($this->genDocOffsetCmp($gen_doc_offset[$i], - $biggest_gen_offset) < 0) { + $biggest_gen_offset) < 0) { $iterator = $this->index_bundle_iterators[$i]; $iterator->advance($biggest_gen_offset); $cur_gen_doc_offset = @@ -311,6 +301,7 @@ class IntersectIterator extends IndexBundleIterator $i = 0; } } + return 1; } @@ -352,7 +343,7 @@ class IntersectIterator extends IndexBundleIterator */ function currentGenDocOffsetWithWord() { $this->syncGenDocOffsetsAmongstIterators(); - $this->index_bundle_iterators[0]->currentGenDocOffsetWithWord(); + return $this->index_bundle_iterators[0]->currentGenDocOffsetWithWord(); } /** @@ -378,8 +369,10 @@ class IntersectIterator extends IndexBundleIterator * a block */ function setResultsPerBlock($num) { - trigger_error("Cannot set the results per block of - an intersect iterator", E_USER_ERROR); + if($num != 1) { + trigger_error("Cannot set the results per block of + an intersect iterator", E_USER_ERROR); + } } } ?> diff --git a/lib/index_bundle_iterators/negation_iterator.php b/lib/index_bundle_iterators/negation_iterator.php new file mode 100644 index 000000000..8c83b2a0e --- /dev/null +++ b/lib/index_bundle_iterators/negation_iterator.php @@ -0,0 +1,255 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010, 2011 Chris Pollett 
chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage iterator + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010, 2011 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + + +/** + *Loads base class for iterating + */ +require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php'; + +/** + * Used to iterate over the documents which don't occur in a set of + * iterator results + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage iterator + * @see IndexArchiveBundle + */ +class NegationIterator extends IndexBundleIterator +{ + /** + * Pair of iterators: [0] runs over all pages, [1] is the one negated + * @var array + */ + var $index_bundle_iterators; + /** + * Number of elements in $this->index_bundle_iterators + * @var int + */ + var $num_iterators; + /** + * Index of the iterator to advance next; not referenced by any method + * of this class + * @var int + */ + var $to_advance_index; + + /** + * Creates a negation iterator with the given parameters.
+ * + * @param object $index_bundle_iterator iterator over the documents that + * are to be excluded from the results + */ + function __construct($index_bundle_iterator) + { + $this->index_bundle_iterators[0] = new WordIterator( + crawlHash("pages:all", true), + $index_bundle_iterator->index, true, + $index_bundle_iterator->filter); + $this->index_bundle_iterators[1] = $index_bundle_iterator; + + $this->num_iterators = 2; + $this->num_docs = 0; + $this->results_per_block = 1; + + $this->num_docs = $this->index_bundle_iterators[0]->num_docs; + + $this->reset(); + } + + /** + * Returns the iterators to the first document block that it could iterate + * over + */ + function reset() + { + for($i = 0; $i < $this->num_iterators; $i++) { + $this->index_bundle_iterators[$i]->setResultsPerBlock(1); + $this->index_bundle_iterators[$i]->reset(); + } + + $this->seen_docs = 0; + $this->seen_docs_unfiltered = 0; + + } + + /** + * Computes a relevancy score for a posting offset with respect to this + * iterator and generation + * @param int $generation the generation the posting offset is for + * @param int $posting_offset an offset into word_docs to compute the + * relevance of + * @return int always 1 for this iterator
+ */ + function computeRelevance($generation, $posting_offset) + { + return 1; + } + + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and rank if there are docs left, -1 otherwise + */ + function findDocsWithWord() + { + + $status = $this->syncGenDocOffsetsAmongstIterators(); + + if($status == -1) { + return -1; + } + //the all-pages iterator supplies the current candidate document + $docs = $this->index_bundle_iterators[0]->currentDocsWithWord(); + + if(is_array($docs) && count($docs) == 1) { + //results_per_block is 1 so there should be only one doc + $keys = array_keys($docs); + $key = $keys[0]; + + $docs[$key][self::RELEVANCE] = 1; + $docs[$key][self::PROXIMITY] = 1; + + $docs[$key][self::SCORE] = $docs[$key][self::DOC_RANK] * + $docs[$key][self::RELEVANCE] * $docs[$key][self::PROXIMITY]; + } + $this->count_block = count($docs); + $this->pages = $docs; + return $docs; + } + + + /** + * Finds the next generation and doc offset amongst the all docs iterator + * and the term to be negated iterator such that the all iterator is + * strictly less than the term iterator.
+ */ + function syncGenDocOffsetsAmongstIterators() + { + $changed_term = false; + $changed_all = false; + do { + $gen_offset_all = $this->index_bundle_iterators[ + 0]->currentGenDocOffsetWithWord(); + if($gen_offset_all == -1 || ($changed_all && + $this->genDocOffsetCmp($gen_offset_all, + $old_gen_offset_all) == 0)) { + return -1; + } + $gen_offset_term = $this->index_bundle_iterators[ + 1]->currentGenDocOffsetWithWord(); + //negated term has no postings left: the current doc qualifies + if($gen_offset_term == -1) { return 1; } + if($changed_term && $this->genDocOffsetCmp($gen_offset_term, + $old_gen_offset_term) == 0) { + return -1; + } + $gen_doc_cmp = $this->genDocOffsetCmp($gen_offset_all, + $gen_offset_term); + if($gen_doc_cmp > 0) { + $this->index_bundle_iterators[1]->advance($gen_offset_all); + $old_gen_offset_term = $gen_offset_term; + $changed_term = true; + $changed_all = false; + } else if($gen_doc_cmp == 0) { + $this->index_bundle_iterators[0]->advance($gen_offset_term); + $old_gen_offset_all = $gen_offset_all; + $changed_term = false; + $changed_all = true; + } + } while($gen_doc_cmp >= 0); + + return 1; + } + + /** + * Forwards the iterator one group of docs + * @param array $gen_doc_offset a generation, doc_offset pair.
If set, + * the next block must be of greater than or equal generation, and if equal the + * next block must all have $doc_offsets larger than or equal to + * this value + */ + function advance($gen_doc_offset = null) + { + $this->advanceSeenDocs(); + + $this->index_bundle_iterators[0]->advance($gen_doc_offset); + + } + + /** + * Gets the doc_offset and generation for the next document that + * would be returned by this iterator + * + * @return mixed an array with the desired document offset + * and generation; -1 on fail + */ + function currentGenDocOffsetWithWord() { + $this->syncGenDocOffsetsAmongstIterators(); + return $this->index_bundle_iterators[0]->currentGenDocOffsetWithWord(); + } + + /** + * Returns the index associated with this iterator + * @return object the index + */ + function getIndex($key = NULL) + { + return $this->index_bundle_iterators[0]->getIndex($key); + } + + /** + * This method is supposed to set + * the value of the result_per_block field. This field controls + * the maximum number of results that can be returned in one go by + * currentDocsWithWord(). This method cannot be consistently + * implemented for this iterator and expect it to behave nicely + * if this iterator is used together with union_iterator.
So + * to prevent a user for doing this, calling this method results + * in a user defined error + * + * @param int $num the maximum number of results that can be returned by + * a block + */ + function setResultsPerBlock($num) { + if($num != 1) { + trigger_error("Cannot set the results per block of + a negation iterator", E_USER_ERROR); + } + } +} +?> diff --git a/lib/index_bundle_iterators/phrase_filter_iterator.php b/lib/index_bundle_iterators/phrase_filter_iterator.php index c2fcc05dd..b1ba81c93 100644 --- a/lib/index_bundle_iterators/phrase_filter_iterator.php +++ b/lib/index_bundle_iterators/phrase_filter_iterator.php @@ -40,14 +40,12 @@ require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php'; /** * Used to iterate through a collection of documents to return only those - * which have certain restricted_phrases and don't have disallowed_phrases. + * which have certain restricted_phrases. * * For restricted_phrases a string like "Chris * Homepage" will match any * string where * has been replace by any other string. So for example it will * match Chris Pollett's Homepage. 
* - * disallowed_phrases are really just disallowed words and must be an exact - * match * * @author Chris Pollett * @package seek_quarry @@ -69,12 +67,6 @@ class PhraseFilterIterator extends IndexBundleIterator */ var $restrict_phrases; - /** - * This iterator returns only documents not containing any the elements of - * disallow phrases - * @var array - */ - var $disallow_phrases; /** * The number of documents in the current block before filtering * by restricted words @@ -109,18 +101,14 @@ class PhraseFilterIterator extends IndexBundleIterator * @param array $restrict_phrases this iterator returns only documents from * $index_bundle_iterator containing all the elements of restrict * phrases - * @param array $disallow_phrases this iterator returns only documents from - * $index_bundle_iterator not containing any of the words in disallow - * phrases * @param float $weight a quantity to multiply each score returned from * this iterator with */ function __construct($index_bundle_iterator, $restrict_phrases, - $disallow_phrases, $weight = 1) + $weight = 1) { $this->index_bundle_iterator = $index_bundle_iterator; $this->restrict_phrases = $restrict_phrases; - $this->disallow_phrases = $disallow_phrases; $this->num_docs = $this->index_bundle_iterator->num_docs; $this->results_per_block = $this->index_bundle_iterator->results_per_block; @@ -199,15 +187,7 @@ class PhraseFilterIterator extends IndexBundleIterator } } } - if($this->disallow_phrases != NULL && - is_array($this->disallow_phrases)) { - foreach($this->disallow_phrases as $phrase) { - if(strlen($phrase) > 0 && - mb_eregi($phrase, $page_string) !== false) { - $found = false; - } - } - } + if($found == true) { $doc_info["WEIGHT"] = $this->weight; $doc_info[self::SCORE] *= $this->weight; diff --git a/lib/index_bundle_iterators/union_iterator.php b/lib/index_bundle_iterators/union_iterator.php index 695a9ce36..de0f54f08 100644 --- a/lib/index_bundle_iterators/union_iterator.php +++ 
b/lib/index_bundle_iterators/union_iterator.php @@ -65,20 +65,12 @@ class UnionIterator extends IndexBundleIterator * @var int */ var $num_iterators; - /** * The number of documents in the current block before filtering * by restricted words * @var int */ var $count_block_unfiltered; - /** - * The number of documents in the current block after filtering - * by restricted words - * @var int - */ - var $count_block; - /** * The number of iterated docs before the restriction test * @var int diff --git a/models/phrase_model.php b/models/phrase_model.php index 114264b03..8b8016bab 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -41,7 +41,10 @@ if(!defined("POST_PROCESSING")) { define("LOG_TO_FILES", false); } /** For crawlHash function */ -require_once BASE_DIR."/lib/utility.php"; +require_once BASE_DIR."/lib/utility.php"; +/** For extractPhrasesAndCount function */ +require_once BASE_DIR."/lib/phrase_parser.php"; + /** * Used to look up words and phrases in the inverted index * associated with a given crawl @@ -454,7 +457,7 @@ class PhraseModel extends Model $meta_words = array('link:', 'site:', 'version:', 'modified:', 'filetype:', 'info:', '\-', 'os:', 'server:', 'date:', 'index:', 'i:', 'ip:', 'weight:', 'w:', 'u:', - 'lang:', 'media:', 'elink:'); + 'lang:', 'media:', 'elink:', 'pages:'); if(isset($this->additional_meta_words)) { $meta_words = array_merge($meta_words, array_keys( $this->additional_meta_words)); @@ -523,7 +526,7 @@ class PhraseModel extends Model $phrase_string = $words[0]; $phrase_hash = crawlHash($phrase_string); $word_struct = array("KEYS" => array($phrase_hash), - "RESTRICT_PHRASES" => NULL, "DISALLOW_PHRASES" => NULL, + "RESTRICT_PHRASES" => NULL, "DISALLOW_KEYS" => array(), "WEIGHT" => $weight, "INDEX_ARCHIVE" => $index_archive ); } else { @@ -543,8 +546,7 @@ class PhraseModel extends Model $hashes = array(); foreach($words as $word) { - $tmp = crawlHash($word); - $hashes[] = $tmp; + $hashes[] = crawlHash($word); } 
$restrict_phrases = $quoteds; @@ -560,11 +562,19 @@ class PhraseModel extends Model $word_keys = NULL; $word_struct = NULL; } + $disallow_keys = array(); + $num_disallow_keys = min(5, count($disallow_phrases)); + for($i = 0; $i < $num_disallow_keys; $i++) { + $disallow_stem=array_keys(PhraseParser::extractPhrasesAndCount( + $disallow_phrases[$i], 2, getLocaleTag())); + //stemmed + $disallow_keys[] = crawlHash($disallow_stem[0]); + } if($word_keys !== NULL) { $word_struct = array("KEYS" => $word_keys, "RESTRICT_PHRASES" => $restrict_phrases, - "DISALLOW_PHRASES" => $disallow_phrases, + "DISALLOW_KEYS" => $disallow_keys, "WEIGHT" => $weight, "INDEX_ARCHIVE" => $index_archive ); @@ -650,7 +660,7 @@ class PhraseModel extends Model foreach($word_structs as $word_struct) { $mem_tmp .= serialize($word_struct["KEYS"]). serialize($word_struct["RESTRICT_PHRASES"]) . - serialize($word_struct["DISALLOW_PHRASES"]) . + serialize($word_struct["DISALLOW_KEYS"]) . $word_struct["WEIGHT"] . $word_struct["INDEX_ARCHIVE"]->dir_name; } @@ -755,7 +765,7 @@ class PhraseModel extends Model if(!is_array($word_struct)) { continue;} $word_keys = $word_struct["KEYS"]; $restrict_phrases = $word_struct["RESTRICT_PHRASES"]; - $disallow_phrases = $word_struct["DISALLOW_PHRASES"]; + $disallow_keys = $word_struct["DISALLOW_KEYS"]; $index_archive = $word_struct["INDEX_ARCHIVE"]; $weight = $word_struct["WEIGHT"]; @@ -769,17 +779,29 @@ class PhraseModel extends Model new WordIterator($word_keys[$i], $index_archive, false, $filter); } + $num_disallow_keys = count($disallow_keys); + if($num_disallow_keys > 0) { + for($i = 0; $i < $num_disallow_keys; $i++) { + $disallow_iterator = + new WordIterator($disallow_keys[$i], $index_archive, + false, $filter); + $word_iterators[$num_word_keys + $i] = + new NegationIterator($disallow_iterator); + } + } + $num_word_keys += $num_disallow_keys; + if($num_word_keys == 1) { $base_iterator = $word_iterators[0]; } else { $base_iterator = new 
IntersectIterator($word_iterators); } - if($restrict_phrases == NULL && $disallow_phrases == NULL && + if($restrict_phrases == NULL && $disallow_keys == array() && $weight == 1) { $iterators[] = $base_iterator; } else { $iterators[] = new PhraseFilterIterator($base_iterator, - $restrict_phrases, $disallow_phrases, $weight); + $restrict_phrases, $weight); } }