diff --git a/bin/fetcher.php b/bin/fetcher.php index 0051b5caf..f6b6119e8 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -305,7 +305,7 @@ class Fetcher implements CrawlConstants crawlLog("Old name: ".$this->web_archive->dir_name); } $this->web_archive = new WebArchiveBundle($tmp_base_name, - URL_FILTER_SIZE, NUM_ARCHIVE_PARTITIONS); + URL_FILTER_SIZE); $this->crawl_time = $info[self::CRAWL_TIME]; $this->sum_seen_title_length = 0; $this->sum_seen_description_length = 0; @@ -740,7 +740,7 @@ class Fetcher implements CrawlConstants $i++; } } // end for - $stored_site_pages = $this->web_archive->addPages(self::HASH, + $cache_page_partition = $this->web_archive->addPages( self::OFFSET, $stored_site_pages); $num_pages = count($stored_site_pages); @@ -750,6 +750,8 @@ class Fetcher implements CrawlConstants if(isset($stored_site_pages[$i][self::OFFSET])) { $summarized_site_pages[$i][self::OFFSET] = $stored_site_pages[$i][self::OFFSET]; + $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] = + $cache_page_partition; } } diff --git a/bin/queue_server.php b/bin/queue_server.php index e666d3fe9..5d2037eeb 100755 --- a/bin/queue_server.php +++ b/bin/queue_server.php @@ -416,8 +416,7 @@ class QueueServer implements CrawlConstants $this->index_archive = new IndexArchiveBundle( CRAWL_DIR.'/cache/'. self::index_data_base_name.$this->crawl_time, - URL_FILTER_SIZE, NUM_ARCHIVE_PARTITIONS, - serialize($info)); + URL_FILTER_SIZE, serialize($info)); } else { $dir = CRAWL_DIR.'/cache/'. 
self::index_data_base_name.$this->crawl_time; @@ -589,11 +588,14 @@ class QueueServer implements CrawlConstants } } - if(isset($seen_sites)) { - $seen_sites = - $this->index_archive->addPages( - self::HASH_URL, self::SUMMARY_OFFSET, $seen_sites, - $visited_urls_count); + if(isset($seen_sites) && isset($sites[self::INVERTED_INDEX])) { + $index_shard = & $sites[self::INVERTED_INDEX]; + $generation = + $this->index_archive->initGenerationToAdd($index_shard); + + $this->index_archive->addPages( + $generation, self::SUMMARY_OFFSET, $seen_sites, + $visited_urls_count); $summary_offsets = array(); foreach($seen_sites as $site) { @@ -604,16 +606,13 @@ class QueueServer implements CrawlConstants " time: ".(changeInMicrotime($start_time))); $start_time = microtime(); // added summary offset info to inverted index data - if(isset($sites[self::INVERTED_INDEX])) { - $index_shard = & $sites[self::INVERTED_INDEX]; - $index_shard->changeDocumentOffsets($summary_offsets); - } - } - crawlLog("C (update shard offsets) memory usage".memory_get_usage() . - " time: ".(changeInMicrotime($start_time))); - $start_time = microtime(); - if(isset($index_shard)) { + $index_shard->changeDocumentOffsets($summary_offsets); + + crawlLog("C (update shard offsets) memory usage".memory_get_usage(). + " time: ".(changeInMicrotime($start_time))); + $start_time = microtime(); + $this->index_archive->addIndexData($index_shard); $this->index_dirty = true; } diff --git a/controllers/archive_controller.php b/controllers/archive_controller.php index de00247aa..12a6a5063 100755 --- a/controllers/archive_controller.php +++ b/controllers/archive_controller.php @@ -98,8 +98,9 @@ class ArchiveController extends Controller implements CrawlConstants { $web_archive = new WebArchiveBundle( CRAWL_DIR.'/cache/'.self::archive_base_name. 
- $_REQUEST['crawl_time'], -1); - $page = $web_archive->getPage($_REQUEST['hash'], $_REQUEST['offset']); + $_REQUEST['crawl_time']); + $page = $web_archive->getPage($_REQUEST['offset'], + $_REQUEST['partition']); echo base64_encode(serialize($page)); } diff --git a/controllers/search_controller.php b/controllers/search_controller.php index 716ab82ee..688722bea 100755 --- a/controllers/search_controller.php +++ b/controllers/search_controller.php @@ -172,8 +172,12 @@ class SearchController extends Controller implements CrawlConstants if(isset($_REQUEST['so'])) { $summary_offset = $this->clean($_REQUEST['so'], "int"); } - $this->cacheRequest($query, $arg, $summary_offset, $highlight, - $index_time_stamp); + $generation = -1; + if(isset($_REQUEST['g'])) { + $generation = $this->clean($_REQUEST['g'], "int"); + } + $this->cacheRequest($query, $arg, $summary_offset, $generation, + $highlight, $index_time_stamp); } } @@ -226,12 +230,16 @@ class SearchController extends Controller implements CrawlConstants if(isset($_REQUEST['so'])) { $summary_offset = $this->clean($_REQUEST['so'], "int"); } - if($summary_offset === NULL) { - $summary_offset = - $this->phraseModel->lookupSummaryOffset($url); + $generation = -1; + if(isset($_REQUEST['g'])) { + $generation = $this->clean($_REQUEST['g'], "int"); + } + if($summary_offset === NULL || $generation == -1) { + list($summary_offset, $generation) = + $this->phraseModel->lookupSummaryOffsetGeneration($url); } - $crawl_item = $this->crawlModel->getCrawlItem( - crawlHash($url, true), $summary_offset); + $crawl_item = $this->crawlModel->getCrawlItem($summary_offset, + $generation); $top_phrases = $this->phraseModel->getTopPhrases($crawl_item, 3); @@ -318,7 +326,7 @@ class SearchController extends Controller implements CrawlConstants * @param int $crawl_time the timestamp of the crawl to look up the cached * page in */ - function cacheRequest($query, $url, $summary_offset, + function cacheRequest($query, $url, $summary_offset = -1, 
$generation = -1, $highlight=true, $crawl_time = 0) { @@ -328,24 +336,26 @@ class SearchController extends Controller implements CrawlConstants $this->phraseModel->index_name = $crawl_time; $this->crawlModel->index_name = $crawl_time; - if($summary_offset === NULL) { - $summary_offset = $this->phraseModel->lookupSummaryOffset($url); + if($summary_offset == -1 || $generation == -1) { + list($summary_offset, $generation) = + $this->phraseModel->lookupSummaryOffsetGeneration($url); } - if(!$crawl_item = $this->crawlModel->getCrawlItem(crawlHash($url, true), - $summary_offset)) { + $data = array(); + if(!$crawl_item = $this->crawlModel->getCrawlItem($summary_offset, + $generation)) { $this->displayView("nocache", $data); exit(); } - $data = array(); + $machine = $crawl_item[self::MACHINE]; $machine_uri = $crawl_item[self::MACHINE_URI]; $page = $crawl_item[self::HASH]; $offset = $crawl_item[self::OFFSET]; $cache_item = $this->crawlModel->getCacheFile($machine, - $machine_uri, $page, $offset, $crawl_time); + $machine_uri, $generation, $offset, $crawl_time); $cache_file = $cache_item[self::PAGE]; diff --git a/lib/bst_array.php b/lib/bst_array.php deleted file mode 100644 index 9e41f3c5f..000000000 --- a/lib/bst_array.php +++ /dev/null @@ -1,149 +0,0 @@ -<?php -/** - * SeekQuarry/Yioop -- - * Open Source Pure PHP Search Engine, Crawler, and Indexer - * - * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org - * - * LICENSE: - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * END LICENSE - * - * @author Chris Pollett chris@pollett.org - * @package seek_quarry - * @subpackage library - * @license http://www.gnu.org/licenses/ GPL3 - * @link http://www.seekquarry.com/ - * @copyright 2009, 2010 - * @filesource - */ -if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} - -/** - * Load charCopy - */ -require_once "utility.php"; - -/** - * - * @author Chris Pollett - * - * @package seek_quarry - * @subpackage library - */ - -class BSTArray -{ - var $data; - var $data_len; - var $key_len; - var $value_len; - var $entry_len; - var $key_compare; - - /** - * - */ - function __construct($key_len, $value_len, $key_compare) - { - $this->data = ""; - $this->data_len = 0; - $this->key_len = $key_len; - $this->value_len = $value_len; - $this->entry_len = $key_len + $value_len + 8; - $this->key_compare = $key_compare; - } - - /** - * - */ - function insertUpdate($key, $value) - { - $key_compare = $this->key_compare; - if($this->contains($key, $offset, $parent_offset)) - { - list(, , $left_offset, $right_offset) = $this->readEntry($offset); - charCopy($key . $value . pack("N",$left_offset) . - pack("N", $right_offset),$this->data, $offset,$this->entry_len); - } else { - if($parent_offset != $offset) { // data already exists - list($parent_key, $parent_value, $parent_left_offset, - $parent_right_offset) = $this->readEntry($parent_offset); - if($key_compare($parent_key, $key) < 0 ) { - $parent_right_offset = $offset; - } else { - $parent_left_offset = $offset; - } - $new_parent_entry = $parent_key . $parent_value . - pack("N", $parent_left_offset) . - pack("N", $parent_right_offset); - charCopy( $new_parent_entry, - $this->data, $parent_offset, $this->entry_len); - } - $this->data .= $key . $value . 
pack("H*", "7FFFFFFF7FFFFFFF"); - $this->data_len += $this->entry_len; - } - } - - /** - * - */ - function contains($key, &$offset, &$parent_offset) - { - $offset = 0; - $parent_offset = 0; - $data_len = $this->data_len; - $entry_len = $this->entry_len; - $last_entry = $data_len - $entry_len; - $key_compare = $this->key_compare; - while($offset <= $last_entry ) { - list($cur_key, , $left_offset, $right_offset) = - $this->readEntry($offset); - $comparison = $key_compare($cur_key, $key); - if($comparison == 0) { - return true; - } else if ($comparison < 0) { - $parent_offset = $offset; - $offset = $right_offset; - } else { - $parent_offset = $offset; - $offset = $left_offset; - } - } - - $offset = $data_len; - return false; - } - - /** - * - */ - function readEntry($offset) - { - $key = substr($this->data, $offset, $this->key_len); - $offset += $this->key_len; - $value = substr($this->data, $offset, $this->value_len); - $offset += $this->value_len; - $left_string = substr($this->data, $offset, 4); - $tmp = unpack("N", $left_string); - $left_offset = $tmp[1]; - $offset += 4; - $right_string = substr($this->data, $offset, 4); - $tmp = unpack("N", $right_string); - $right_offset = $tmp[1]; - return array($key, $value, $left_offset, $right_offset); - } -} diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php index 87ff70ccc..49cb8accf 100644 --- a/lib/crawl_constants.php +++ b/lib/crawl_constants.php @@ -138,5 +138,6 @@ interface CrawlConstants const NEEDS_OFFSET_FLAG = 0x7FFFFFFE; const DUPLICATE_FLAG = 0x7FFFFFFF; + const CACHE_PAGE_PARTITION = 'aq'; } ?> diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php index 2835f5e95..228a0872a 100644 --- a/lib/index_archive_bundle.php +++ b/lib/index_archive_bundle.php @@ -127,14 +127,11 @@ class IndexArchiveBundle implements CrawlConstants * the WebArchiveBundles * @param int $num_partitions_summaries number of WebArchive partitions * to use in the summmaries WebArchiveBundle - * @param int 
$num_partitions_index number of WebArchive partitions - * to use in the index WebArchiveBundle * @param string $description a text name/serialized info about this * IndexArchiveBundle */ public function __construct($dir_name, $filter_size = -1, - $num_partitions_summaries = NULL, $description = NULL, - $num_docs_per_generation = NUM_DOCS_PER_GENERATION) + $description = NULL, $num_docs_per_generation = NUM_DOCS_PER_GENERATION) { $this->dir_name = $dir_name; @@ -147,7 +144,6 @@ class IndexArchiveBundle implements CrawlConstants $index_archive_exists = true; } - if(file_exists($this->dir_name."/generation.txt")) { $this->generation_info = unserialize( file_get_contents($this->dir_name."/generation.txt")); @@ -157,11 +153,9 @@ class IndexArchiveBundle implements CrawlConstants serialize($this->generation_info)); } $this->summaries = new WebArchiveBundle($dir_name."/summaries", - $filter_size, $num_partitions_summaries, $description); + $filter_size, -1, $description); $this->summaries->initCountIfNotExists("VISITED_URLS_COUNT"); - $this->num_partitions_summaries = $this->summaries->num_partitions; - $this->description = $this->summaries->description; $this->num_docs_per_generation = $num_docs_per_generation; @@ -170,36 +164,52 @@ class IndexArchiveBundle implements CrawlConstants /** * Add the array of $pages to the summaries WebArchiveBundle pages being - * stored in the partition according to the $key_field and the field used + * stored in the partition $generation and the field used * to store the resulting offsets given by $offset_field. * - * @param string $key_field field used to select partition + * @param int $generation field used to select partition * @param string $offset_field field used to record offsets after storing * @param array &$pages data to store * @param int $visited_urls_count number to add to the count of visited urls * (visited urls is a smaller number than the total count of objects * stored in the index). 
- * @return array $pages adjusted with offset field */ - public function addPages($key_field, $offset_field, $pages, + public function addPages($generation, $offset_field, &$pages, $visited_urls_count) { - $result = $this->summaries->addPages($key_field, $offset_field, $pages); + $this->summaries->setWritePartition($generation); + $this->summaries->addPages($offset_field, $pages); $this->summaries->addCount($visited_urls_count, "VISITED_URLS_COUNT"); - return $result; } /** * Adds the provided mini inverted index data to the IndexArchiveBundle * - * @param array $index_data a mini inverted index of word_key=>doc data + * @param object &$index_shard a mini inverted index of word_key=>doc data * to add to this IndexArchiveBundle */ - public function addIndexData($index_shard) + public function addIndexData(&$index_shard) { crawlLog("**ADD INDEX DIAGNOSTIC INFO..."); $start_time = microtime(); + + $this->getActiveShard()->appendIndexShard($index_shard); + crawlLog("Append Index Shard: Memory usage:".memory_get_usage() . + " Time: ".(changeInMicrotime($start_time))); + } + + /** + * Determines based on its size, if index_shard should be added to + * the active generation or in a new generation should be started. + * If so, a new generation is started. + * + * @param object &$index_shard a mini inverted index of word_key=>doc data + * @return int the active generation after the check and possible change has + * been performed + */ + public function initGenerationToAdd(&$index_shard) + { $current_num_docs = $this->getActiveShard()->num_docs; $add_num_docs = $index_shard->num_docs; if($current_num_docs + $add_num_docs > $this->num_docs_per_generation){ @@ -217,9 +227,8 @@ class IndexArchiveBundle implements CrawlConstants serialize($this->generation_info)); crawlLog("Switch Shard time:".changeInMicrotime($switch_time)); } - $this->getActiveShard()->appendIndexShard($index_shard); - crawlLog("Append Index Shard: Memory usage:".memory_get_usage() . 
- " Time: ".(changeInMicrotime($start_time))); + + return $this->generation_info['ACTIVE']; } /** @@ -291,17 +300,19 @@ class IndexArchiveBundle implements CrawlConstants /** * Gets the page out of the summaries WebArchiveBundle with the given - * key and offset + * offset and generation * - * The $key determines the partition WebArchive, the $offset give the - * byte offset within that archive. - * @param string $key hash to use to look up WebArchive partition * @param int $offset byte offset in partition of desired page + * @param int $generation which generation WebArchive to look up in + * defaults to the same number as the current shard * @return array desired page */ - public function getPage($key, $offset) + public function getPage($offset, $generation = -1) { - return $this->summaries->getPage($key, $offset); + if($generation == -1 ) { + $generation = $this->generation_info['CURRENT']; + } + return $this->summaries->getPage($offset, $generation); } diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php index 122e5d016..e574d7276 100644 --- a/lib/index_bundle_iterators/group_iterator.php +++ b/lib/index_bundle_iterators/group_iterator.php @@ -285,8 +285,7 @@ class GroupIterator extends IndexBundleIterator foreach($doc_info[self::SUMMARY_OFFSET] as $offset_array) { list($key, $summary_offset) = $offset_array; $index = & $this->getIndex($key); - $page = $index->getPage( - $key, $summary_offset); + $page = $index->getPage($summary_offset); if(!isset($out_pages[$doc_key][self::SUMMARY])) { $out_pages[$doc_key][self::SUMMARY] = $page; } else if (isset($page[self::DESCRIPTION])) { diff --git a/lib/index_bundle_iterators/index_bundle_iterator.php b/lib/index_bundle_iterators/index_bundle_iterator.php index dbef9bc3b..ed5ef90c9 100644 --- a/lib/index_bundle_iterators/index_bundle_iterator.php +++ b/lib/index_bundle_iterators/index_bundle_iterator.php @@ -154,8 +154,7 @@ abstract class IndexBundleIterator implements 
CrawlConstants $doc_info = $pages[$doc_key]; } if(isset($doc_info[self::SUMMARY_OFFSET])) { - $page = $index->getPage( - $doc_key, $doc_info[self::SUMMARY_OFFSET]); + $page = $index->getPage($doc_info[self::SUMMARY_OFFSET]); $out_pages[$doc_key] = $doc_info; $out_pages[$doc_key][self::SUMMARY] = $page; } diff --git a/lib/web_archive_bundle.php b/lib/web_archive_bundle.php index 25a2df175..76f6e4dd3 100755 --- a/lib/web_archive_bundle.php +++ b/lib/web_archive_bundle.php @@ -91,15 +91,15 @@ class WebArchiveBundle */ var $page_exists_filter_bundle; /** - * Number of WebArchives in the WebArchiveBundle + * Total number of page objects stored by this WebArchiveBundle * @var int */ - var $num_partitions; + var $count; /** - * Total number of page objects stored by this WebArchiveBundle + * The index of the partition to which new documents will be added * @var int */ - var $count; + var $write_partition; /** * A short text name for this WebArchiveBundle * @var string @@ -119,24 +119,27 @@ class WebArchiveBundle * @param string $dir_name folder name of the bundle * @param int $filter_size number of items that can be stored in * a given BloomFilterFile in the $page_exists_filter_bundle - * @param int $num_partitions number of WebArchive's in this bundle + * @param int $num_docs_per_partition number of documents before the + * web archive is changed * @param string $description a short text name/description of this * WebArchiveBundle * @param string $compressor the Compressor object used to * compress/uncompress data stored in the bundle */ function __construct($dir_name, $filter_size = -1, - $num_partitions = NULL, $description = NULL, + $num_docs_per_partition = NUM_DOCS_PER_GENERATION, $description = NULL, $compressor = "GzipCompressor") { //filter size = -1 used by web server to not get all partitions created $this->dir_name = $dir_name; $this->filter_size = $filter_size; + $this->num_docs_per_partition = $num_docs_per_partition; $this->compressor = $compressor; + 
$this->write_partition = 0; $read_only_archive = false; - if($num_partitions == NULL) { + if($filter_size == -1) { $read_only_archive = true; } @@ -151,16 +154,18 @@ class WebArchiveBundle file_get_contents($this->dir_name."/description.txt")); } - $this->num_partitions = $num_partitions; - if(isset($info['NUM_PARTITIONS'])) { - $this->num_partitions = $info['NUM_PARTITIONS']; + if(isset($info['NUM_DOCS_PER_PARTITION'])) { + $this->num_docs_per_partition = $info['NUM_DOCS_PER_PARTITION']; } $this->count = 0; if(isset($info['COUNT'])) { $this->count = $info['COUNT']; } - + + if(isset($info['WRITE_PARTITION'])) { + $this->write_partition = $info['WRITE_PARTITION']; + } if(isset($info['DESCRIPTION']) ) { $this->description = $info['DESCRIPTION']; } else { @@ -171,8 +176,9 @@ class WebArchiveBundle } $info['DESCRIPTION'] = $this->description; - $info['NUM_PARTITIONS'] = $this->num_partitions; + $info['NUM_DOCS_PER_PARTITION'] = $this->num_docs_per_partition; $info['COUNT'] = $this->count; + $info['WRITE_PARTITION'] = $this->write_partition; if(!$read_only_archive) { file_put_contents( $this->dir_name."/description.txt", serialize($info)); @@ -191,80 +197,61 @@ class WebArchiveBundle /** * Add the array of $pages to the WebArchiveBundle pages being stored in - * the partition according to the $key_field and the field used to store + * the partition according to write partition and the field used to store * the resulting offsets given by $offset_field. 
* - * @param string $key_field field used to select partition * @param string $offset_field field used to record offsets after storing * @param array &$pages data to store - * @return array $pages adjusted with offset field + * @return int the write_partition the pages were stored in */ - function addPages($key_field, $offset_field, &$pages) + function addPages($offset_field, &$pages) { - $partition_queue = array(); - for($i = 0; $i < $this->num_partitions; $i++) { - $partition_queue[$i] = array(); - } $num_pages = count($pages); - for($i = 0; $i < $num_pages; $i++) { - //we are doing this to preserve the order of the returned array - $pages[$i]['TMP_INDEX'] = $i; - } - - foreach($pages as $page) { - if(isset($page[$key_field])) { - $this->count++; - $index = WebArchiveBundle::selectPartition( - $page[$key_field], $this->num_partitions); - - $partition_queue[$index][] = $page; - } + if($this->num_docs_per_partition > 0 && + $num_pages > $this->num_docs_per_partition) { + crawlLog("ERROR! At most ".$this->num_docs_per_partition. 
+                " pages can be added in one go!");             exit();         } -        $pages_with_offsets = array();         -        for($i = 0; $i < $this->num_partitions; $i++) {             -            $pages_with_offsets = array_merge($pages_with_offsets,                  -                $this->addObjectsPartition(                     -                    $offset_field, $i, $partition_queue[$i]));         +        $partition = $this->getPartition($this->write_partition);         +        $part_count = $partition->count;         +        if($this->num_docs_per_partition > 0 &&              +            $num_pages + $part_count > $this->num_docs_per_partition) {             +            $this->setWritePartition($this->write_partition + 1);             +            $partition = $this->getPartition($this->write_partition);         }  -        foreach($pages_with_offsets as $off_page) {             -            $pages[$off_page['TMP_INDEX']][$offset_field] =                  -                $off_page[$offset_field];             -            unset($pages[$off_page['TMP_INDEX']]['TMP_INDEX'] );         -        }         -        return $pages; +        $this->addCount($num_pages); //only adds to count on disk         +        $this->count += $num_pages; + +        $partition->addObjects($offset_field, $pages, NULL, NULL, false); + +        return $this->write_partition;     }      /** -     * Gets the page out of the WebArchiveBundle with the given key and offset -     * -     * The $key determines the partition WebArchive, the $offset give the -     * byte offset within that archive. -     * @param string $key hash to use to look up WebArchive partition -     * @param int $offset byte offset in partition of desired page -     * @return array desired page +     * Sets the write partition to the given index and creates the +     * corresponding web archive if it does not already exist.      */ -    function getPage($key, $offset)     +    function setWritePartition($i)     {         -        $partition =              -            WebArchiveBundle::selectPartition($key, $this->num_partitions); -         -        return $this->getPageByPartition($partition, $offset); +        $this->write_partition = $i;         +        $this->getPartition($this->write_partition);     }      /**      * Gets a page using in WebArchive $partition using the provided byte      * $offset and using existing $file_handle if possible. 
* - * @param int $partition which WebArchive to look in * @param int $offset byte offset of page data + * @param int $partition which WebArchive to look in * @param resource $file_handle file handle resource of $partition archive * @return array desired page */ - function getPageByPartition($partition, $offset, $file_handle = NULL) + function getPage($offset, $partition, $file_handle = NULL) { $page_array = $this->getPartition($partition)->getObjects( @@ -294,51 +281,6 @@ class WebArchiveBundle } } - /** - * Adds a list of objects to a given WebArchive partition - * - * @param string $offset_field field used to store offsets after the - * addition - * @param int $partition WebArchive index to store data into - * @param array &$objects objects to store - * @param array $data info header data to write - * @param string $callback function name of function to call as each - * object is stored. Can be used to save offset into $data - * @param bool $return_flag whether to return modified $objects or not - * @return mixed adjusted objects or void - */ - function addObjectsPartition($offset_field, $partition, - &$objects, $data = NULL, $callback = NULL, $return_flag = true) - { - $num_objects = count($objects); - $this->addCount($num_objects); - - return $this->getPartition($partition)->addObjects( - $offset_field, $objects, $data, $callback, $return_flag); - } - - /** - * Reads the info block of $partition WebArchive - * - * @param int $partition WebArchive to read from - * @return array data in its info block - */ - function readPartitionInfoBlock($partition) - { - return $this->getPartition($partition)->readInfoBlock(); - } - - /** - * Write $data into the info block of the $partition WebArchive - * - * @param int $partition WebArchive to write into - * @param array $data what to write - */ - function writePartitionInfoBlock($partition, &$data) - { - $this->getPartition($partition)->writeInfoBlock(NULL, $data); - } - /** * Looks at the $key_field key of elements of 
pages and computes an array * consisting of $key_field values which are not in @@ -387,6 +329,8 @@ class WebArchiveBundle } } + + /** * Gets an object encapsulating the $index the WebArchive partition in * this bundle. @@ -412,7 +356,6 @@ class WebArchiveBundle chmod($this->dir_name."/web_archive_".$index, 0777); } } - return $this->partition[$index]; } @@ -466,7 +409,7 @@ class WebArchiveBundle $info['DESCRIPTION'] = "Archive does not exist OR Archive description file not found"; $info['COUNT'] = 0; - $info['NUM_PARTITIONS'] = 0; + $info['NUM_DOCS_PER_PARTITION'] = -1; return $info; } @@ -476,26 +419,5 @@ class WebArchiveBundle } - /** - * Hashes $value to a WebArchive partition it should be read/written to, - * if a bundle has $num_partitions partitions. - * - * @param string $value item to hash - * @param int $num_partitions number of partitions - * @return int which partition $value should be written to/read from - */ - static function selectPartition($value, $num_partitions) - { - - $hash = substr(md5($value, true), 0, 4); - $int_array = unpack("N", $hash); - $seed = $int_array[1]; - - mt_srand($seed); - $index = mt_rand(0, $num_partitions - 1); - - return $index; - - } } ?> diff --git a/models/crawl_model.php b/models/crawl_model.php index 8eedcf2e2..f01cec47f 100755 --- a/models/crawl_model.php +++ b/models/crawl_model.php @@ -75,21 +75,22 @@ class CrawlModel extends Model implements CrawlConstants /** - * Get a summary of a document by it document id (a string hash value) - * and its offset + * Get a summary of a document by the generation it is in + * and its offset into the corresponding WebArchive. * - * @param string $ukey document id hash string - * @param int $summary_offset offset into a partition in a WebArchiveBundle + * @param int $summary_offset offset in $generation WebArchive + * @param int $generation the index of the WebArchive in the + * IndexArchiveBundle to find the item in. 
* @return array summary data of the matching document */ - function getCrawlItem($ukey, $summary_offset) + function getCrawlItem($summary_offset, $generation) { $index_archive_name = self::index_data_base_name . $this->index_name; $index_archive = new IndexArchiveBundle(CRAWL_DIR.'/cache/'.$index_archive_name); - $summary = $index_archive->getPage($ukey, $summary_offset); + $summary = $index_archive->getPage($summary_offset, $generation); return $summary; } @@ -108,27 +109,29 @@ class CrawlModel extends Model implements CrawlConstants * cached page lives on * @param string $machine_uri the path from document root on $machine where * the yioop scripts live - * @param string $hash the hash that was used to represent the page in the - * WebArchiveBundle + * @param int $partition the partition in the WebArchiveBundle the page is + * in * @param int $offset the offset in bytes into the WebArchive partition in * the WebArchiveBundle at which the cached page lives. * @param string $crawl_time the timestamp of the crawl the cache page is * from * @return array page data of the cached page */ - function getCacheFile($machine, $machine_uri, $hash, $offset, $crawl_time) + function getCacheFile($machine, $machine_uri, $partition, + $offset, $crawl_time) { $time = time(); $session = md5($time . AUTH_KEY); - if($machine == '::1') { - $machine = "localhost"; + if($machine == '::1') { //IPv6 :( + $machine = "[::1]/"; //used if the fetching and queue serving were on the same machine } $request= "http://$machine$machine_uri?c=archive&a=cache&time=$time". - "&session=$session&hash=$hash&offset=$offset". + "&session=$session&partition=$partition&offset=$offset". 
"&crawl_time=$crawl_time"; - $page = @unserialize(base64_decode(FetchUrl::getPage($request))); + $tmp = FetchUrl::getPage($request); + $page = @unserialize(base64_decode($tmp)); $page['REQUEST'] = $request; return $page; @@ -196,7 +199,9 @@ class CrawlModel extends Model implements CrawlConstants isset($info['VISITED_URLS_COUNT']) ? $info['VISITED_URLS_COUNT'] : 0; $crawl['COUNT'] = $info['COUNT']; - $crawl['NUM_PARTITIONS'] = $info['NUM_PARTITIONS']; + $crawl['NUM_DOCS_PER_PARTITION'] = + $info['NUM_DOCS_PER_PARTITION']; + $crawl['WRITE_PARTITION'] = $info['WRITE_PARTITION']; $list[] = $crawl; } } diff --git a/models/phrase_model.php b/models/phrase_model.php index c822caba3..151aef13c 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -139,33 +139,38 @@ class PhraseModel extends Model /** * Determines the offset into the summaries WebArchiveBundle of the - * provided url so that it can be retrieved. This relies on the - * fact that the info:url meta word has been stored. + * provided url so that the info:url summary can be retrieved. + * This assumes of course that the info:url meta word has been stored. * * @param string $url what to lookup - * @return int offset into the web archive bundle + * @return array (offset, generation) into the web archive bundle */ - function lookupSummaryOffset($url) + function lookupSummaryOffsetGeneration($url) { $index_archive_name = self::index_data_base_name . 
$this->index_name; $index_archive = new IndexArchiveBundle( CRAWL_DIR.'/cache/'.$index_archive_name); - $word_iterator = - new WordIterator(crawlHash("info:$url"), $index_archive); $num_retrieved = 0; $pages = array(); $summary_offset = NULL; - while(is_array($next_docs = $word_iterator->nextDocsWithWord()) && - $num_retrieved < 1) { - foreach($next_docs as $doc_key => $doc_info) { - $summary_offset = & $doc_info[CrawlConstants::SUMMARY_OFFSET]; - $num_retrieved++; - if($num_retrieved >= 1) { - break 2; + $num_generations = $index_archive->generation_info['ACTIVE']; + for($i = 0; $i <= $num_generations && $num_retrieved < 1; $i++) { + $index_archive->setCurrentShard($i); + $word_iterator = + new WordIterator(crawlHash("info:$url"), $index_archive); + while(is_array($next_docs = $word_iterator->nextDocsWithWord()) && + $num_retrieved < 1) { + foreach($next_docs as $doc_key => $doc_info) { + $summary_offset = & + $doc_info[CrawlConstants::SUMMARY_OFFSET]; + $num_retrieved++; + if($num_retrieved >= 1) { + break 3; + } } - } + } } - return $summary_offset; + return array($summary_offset, $i); } /** @@ -419,6 +424,7 @@ class PhraseModel extends Model if($generation > $index_archive->generation_info['ACTIVE']) { continue; } + $index_archive->setCurrentShard($generation); $weight = $word_struct["WEIGHT"]; $num_word_keys = count($word_keys); diff --git a/tests/bst_array_test.php b/tests/bst_array_test.php deleted file mode 100644 index 5c545e4d2..000000000 --- a/tests/bst_array_test.php +++ /dev/null @@ -1,103 +0,0 @@ -<?php -/** - * SeekQuarry/Yioop -- - * Open Source Pure PHP Search Engine, Crawler, and Indexer - * - * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org - * - * LICENSE: - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * END LICENSE - * - * @author Chris Pollett chris@pollett.org - * @package seek_quarry - * @subpackage test - * @license http://www.gnu.org/licenses/ GPL3 - * @link http://www.seekquarry.com/ - * @copyright 2009, 2010 - * @filesource - */ - -if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} - -/** - * Load the string_array library we'll be testing - */ -require_once BASE_DIR."/lib/bst_array.php"; - -/** - * Used to test that the BSTArray class properly stores/retrieves values, - * - * @author Chris Pollett - * @package seek_quarry - * @subpackage test - */ -class BSTArrayTest extends UnitTest -{ - /** - * We'll use two different tables one more representative of how the table - * is going to be used by the web_queue_bundle, the other small enough that - * we can manually figure out what the result should be - */ - public function setUp() - { - $this->test_objects['BST'] = new BSTArray(1, 1, "strcmp"); - } - - /** - */ - public function tearDown() - { - unset($this->test_objects['BST']); - } - - /** - * Check if can put objects into BST array and retrieve them - */ - public function insertTestCase() - { - $this->test_objects['BST']->insertUpdate(chr(65), chr(66)); - $flag = $this->test_objects['BST']->contains(chr(65), $offset, $parent); - $this->assertTrue($flag, "BST contains what was just inserted"); - $this->test_objects['BST']->insertUpdate(chr(67), chr(68)); - $flag = $this->test_objects['BST']->contains(chr(67), $offset, $parent); - $this->assertTrue($flag, "BST contains second insert"); - $this->test_objects['BST']->insertUpdate(chr(66), chr(69)); - $flag 
= $this->test_objects['BST']->contains(chr(66), $offset, $parent); - $this->assertTrue($flag, "BST contains third insert"); - $this->test_objects['BST']->insertUpdate(chr(69), chr(69)); - $flag = $this->test_objects['BST']->contains(chr(69), $offset, $parent); - $this->assertTrue($flag, "BST contains fourth insert"); - } - - /** - * Check if can modify objects in BST array - */ - public function updateTestCase() - { - $this->test_objects['BST']->insertUpdate(chr(65), chr(66)); - $this->test_objects['BST']->insertUpdate(chr(67), chr(68)); - $this->test_objects['BST']->insertUpdate(chr(66), chr(69)); - $this->test_objects['BST']->insertUpdate(chr(69), chr(69)); - $this->test_objects['BST']->insertUpdate(chr(66), chr(66)); - $this->test_objects['BST']->contains(chr(66), $offset, $parent); - list($key, $value, $left, $right) = $this->test_objects['BST']-> - readEntry($offset); - $this->assertEqual($value, chr(66), "BST contains fourth insert"); - } - -} -?> diff --git a/views/search_view.php b/views/search_view.php index b0f5b1472..aaa5f6f4f 100755 --- a/views/search_view.php +++ b/views/search_view.php @@ -113,7 +113,7 @@ class SearchView extends View implements CrawlConstants $data['TOTAL_ROWS'])); ?> )</h2> <?php - foreach($data['PAGES'] as $page) {?> + foreach($data['PAGES'] as $page) {?> <div class='result'> <h2> <a href="<?php if($page[self::TYPE] != "link") { @@ -148,6 +148,7 @@ class SearchView extends View implements CrawlConstants e($data['QUERY']); ?>&arg=<?php e(urlencode($page[self::URL])); ?>&so=<?php e($page[self::SUMMARY_OFFSET]); + ?>&g=<?php e($page[self::CACHE_PAGE_PARTITION]); ?>&its=<?php e($data['its']); ?>" > <?php if($page[self::TYPE] == "text/html" ||