diff --git a/bin/fetcher.php b/bin/fetcher.php index 14be82124..0051b5caf 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -1157,16 +1157,17 @@ class Fetcher implements CrawlConstants mb_ereg_replace("[[:punct:]]", " ", $link_text); $link_word_counts = PhraseParser::extractPhrasesAndCount($link_text); - $link_shard->addDocumentWords($link_key, 0, + $link_shard->addDocumentWords($link_key, + self::NEEDS_OFFSET_FLAG, $link_word_counts, array()); $meta_ids[] = 'link:'.$url; } } - $index_shard->addDocumentWords($doc_key, 0, $word_counts, - $meta_ids); - + $index_shard->addDocumentWords($doc_key, self::NEEDS_OFFSET_FLAG, + $word_counts, $meta_ids); + $index_shard->appendIndexShard($link_shard); } diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php index 7f69091dd..87ff70ccc 100644 --- a/lib/crawl_constants.php +++ b/lib/crawl_constants.php @@ -135,5 +135,8 @@ interface CrawlConstants const RELEVANCE ='an'; const DUPLICATE ='ao'; const META_WORDS ='ap'; + + const NEEDS_OFFSET_FLAG = 0x7FFFFFFE; + const DUPLICATE_FLAG = 0x7FFFFFFF; } ?> diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php index 5c55f159e..2835f5e95 100644 --- a/lib/index_archive_bundle.php +++ b/lib/index_archive_bundle.php @@ -54,11 +54,6 @@ require_once 'utility.php'; */ require_once 'crawl_constants.php'; -/** - *Loads common constants for word indexing - */ -require_once 'indexing_constants.php'; - /** * Encapsulates a set of web page summaries and an inverted word-index of terms @@ -81,7 +76,7 @@ require_once 'indexing_constants.php'; * @package seek_quarry * @subpackage library */ -class IndexArchiveBundle implements IndexingConstants, CrawlConstants +class IndexArchiveBundle implements CrawlConstants { /** diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php index d5ace8fa8..122e5d016 100644 --- a/lib/index_bundle_iterators/group_iterator.php +++ b/lib/index_bundle_iterators/group_iterator.php @@ -33,11 +33,6 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} -/** - *Loads common constants for word indexing - */ -require_once BASE_DIR.'/lib/indexing_constants.php'; - /** *Loads base class for iterating */ diff --git a/lib/index_bundle_iterators/index_bundle_iterator.php b/lib/index_bundle_iterators/index_bundle_iterator.php index 9bb7fec8c..dbef9bc3b 100644 --- a/lib/index_bundle_iterators/index_bundle_iterator.php +++ b/lib/index_bundle_iterators/index_bundle_iterator.php @@ -33,11 +33,6 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} -/** - *Loads common constants for word indexing - */ -require_once BASE_DIR.'/lib/indexing_constants.php'; - /** * Abstract classed used to model iterating documents indexed in * an IndexArchiveBundle or set of such bundles. @@ -48,7 +43,7 @@ require_once BASE_DIR.'/lib/indexing_constants.php'; * @subpackage iterator * @see IndexArchiveBundle */ -abstract class IndexBundleIterator implements IndexingConstants, CrawlConstants +abstract class IndexBundleIterator implements CrawlConstants { /** diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php index 6bfa74121..d498461dd 100644 --- a/lib/index_bundle_iterators/intersect_iterator.php +++ b/lib/index_bundle_iterators/intersect_iterator.php @@ -33,11 +33,6 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} -/** - *Loads common constants for word indexing - */ -require_once BASE_DIR.'/lib/indexing_constants.php'; - /** *Loads BloomFilterFile to remember things we've already grouped */ diff --git a/lib/index_bundle_iterators/phrase_filter_iterator.php b/lib/index_bundle_iterators/phrase_filter_iterator.php index a844c3e9d..fc7fa5fc6 100644 --- a/lib/index_bundle_iterators/phrase_filter_iterator.php +++ b/lib/index_bundle_iterators/phrase_filter_iterator.php @@ -33,11 +33,6 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} -/** - *Loads common constants for word indexing - */ -require_once BASE_DIR.'/lib/indexing_constants.php'; - /** *Loads base class for iterating */ diff --git a/lib/index_bundle_iterators/union_iterator.php b/lib/index_bundle_iterators/union_iterator.php index 4c79ec0a0..9f3df46b7 100644 --- a/lib/index_bundle_iterators/union_iterator.php +++ b/lib/index_bundle_iterators/union_iterator.php @@ -33,11 +33,6 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} -/** - *Loads common constants for word indexing - */ -require_once BASE_DIR.'/lib/indexing_constants.php'; - /** *Loads BloomFilterFile to remember things we've already grouped */ diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php index a1e31358d..93a5fa321 100644 --- a/lib/index_bundle_iterators/word_iterator.php +++ b/lib/index_bundle_iterators/word_iterator.php @@ -33,11 +33,6 @@ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} -/** - *Loads common constants for word indexing - */ -require_once BASE_DIR.'/lib/indexing_constants.php'; - /** *Loads base class for iterating */ diff --git a/lib/index_shard.php b/lib/index_shard.php index 019c18e21..c281cc0ff 100644 --- a/lib/index_shard.php +++ b/lib/index_shard.php @@ -215,7 +215,7 @@ class IndexShard extends PersistentStructure implements CrawlConstants $previous_string = substr($value, 4, 4); $count_array = unpack("N", substr($value, 8, 4)); $count = $count_array[1]; - if($count == 0x7FFFFFFF) { continue; } + if($count == self::DUPLICATE_FLAG) { continue; } $count++; $value = $first_string . pack("N", $this->word_docs_len) . pack("N", $count); @@ -332,7 +332,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants $is_doc = false; $skip_stats = false; - if($item[self::SUMMARY_OFFSET] == 0x7FFFFFFF) { + if($item[self::SUMMARY_OFFSET] == self::DUPLICATE_FLAG || + $item[self::SUMMARY_OFFSET] == self::NEEDS_OFFSET_FLAG) { $skip_stats = true; $item[self::DUPLICATE] = true; } else if(($tmp[1] & self::COMPOSITE_ID_FLAG) !== 0) { @@ -444,8 +445,11 @@ class IndexShard extends PersistentStructure implements CrawlConstants $count_string = substr($value, 8, 4); $tmp = unpack("N", $count_string); $count = $tmp[1]; - if($count == 0x7FFFFFFF || $add_count == 0x7FFFFFFF) { - $new_count = 0x7FFFFFFF; + if($count == self::DUPLICATE_FLAG || + $count == self::NEEDS_OFFSET_FLAG || + $add_count == self::DUPLICATE_FLAG || + $add_count == self::NEEDS_OFFSET_FLAG) { + $new_count =self::DUPLICATE_FLAG; } else { $new_count = $count + $add_count; } @@ -487,7 +491,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants $id = substr($this->doc_infos, $i, 8); $tmp = unpack("N", substr($this->doc_infos, $i + 8, 4)); $offset = $tmp[1]; - if($offset == 0x7FFFFFFF) {continue; }//ignore duplicates + if($offset == self::DUPLICATE_FLAG) {continue; }//ignore duplicates + //notice don't ignore NEEDS_OFFSET_FLAG $comp_flag = 0; if(($offset & self::COMPOSITE_ID_FLAG) !== 0) { //handle link item case @@ -507,8 +512,8 @@ class IndexShard extends PersistentStructure implements CrawlConstants /** * Marks a set of urls as duplicates of urls previously seen * To do this the url's doc_id has associated with a summary - * offset of value 0x7FFFFFFF, and its length is set to - * 0XFFFFFFFF + * offset of value 0x7FFFFFFF (CrawlConstants::DUPLICATE_FLAG), and its + * length is set to 0XFFFFFFFF * * @param array $doc_urls urls to mark as duplicates. */ @@ -516,12 +521,12 @@ class IndexShard extends PersistentStructure implements CrawlConstants { foreach($doc_urls as $duplicate) { $doc_key = crawlHash($duplicate, true); - $this->doc_infos .= $doc_key . pack("N", 0x7FFFFFFF). + $this->doc_infos .= $doc_key . pack("N", self::DUPLICATE_FLAG). pack("N", 0xFFFFFFFF); $word_key = crawlHash("info:".$duplicate, true); $this->word_docs .= pack("N", ($this->docids_len<< 4)).pack("N",0); $tmp = pack("N", $this->word_docs_len); - $this->words[$word_key] = $tmp.$tmp.pack("N", 0x7FFFFFFF); + $this->words[$word_key] = $tmp.$tmp.pack("N", self::DUPLICATE_FLAG); $this->word_docs_len += 8; $this->docids_len += 16; } diff --git a/lib/indexing_constants.php b/lib/indexing_constants.php deleted file mode 100644 index ba5348276..000000000 --- a/lib/indexing_constants.php +++ /dev/null @@ -1,58 +0,0 @@ -<?php -/** - * SeekQuarry/Yioop -- - * Open Source Pure PHP Search Engine, Crawler, and Indexer - * - * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org - * - * LICENSE: - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * END LICENSE - * - * @author Chris Pollett chris@pollett.org - * @package seek_quarry - * @subpackage library - * @license http://www.gnu.org/licenses/ GPL3 - * @link http://www.seekquarry.com/ - * @copyright 2009, 2010 - * @filesource - */ - -if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} - -/** - * Enumerative interface for common constants between WordIterator and - * IndexArchiveBundle - * - * These constants are used as fields in arrays. They are negative to - * distinguish them from normal array elements 0, 1, 2... However, this - * means you need to be slightly careful if you try to sort the array - * as this might screw things up - * - * @author Chris Pollett - * @package seek_quarry - * @subpackage library - */ -interface IndexingConstants -{ - const COUNT = -1; - const END_BLOCK = -2; - const LIST_OFFSET = -3; - const POINT_BLOCK = -4; - const PARTIAL_COUNT = -5; - const NAME = -6; -} -?>