diff --git a/bin/fetcher.php b/bin/fetcher.php index b72cfbdcf..59a209e65 100755 --- a/bin/fetcher.php +++ b/bin/fetcher.php @@ -155,6 +155,12 @@ class Fetcher implements CrawlConstants * @var array */ var $found_sites; + /** + * Urls of duplicate sites that the fetcher hasn't sent to + * the queue_server yet + * @var array + */ + var $found_duplicates; /** * Timestamp from the queue_server of the current schedule of sites to * download. This is sent back to the server once this schedule is completed @@ -217,6 +223,7 @@ class Fetcher implements CrawlConstants $this->to_crawl = array(); $this->to_crawl_again = array(); $this->found_sites = array(); + $this->found_duplicates = array(); $this->sum_seen_title_length = 0; $this->sum_seen_description_length = 0; @@ -320,8 +327,11 @@ class Fetcher implements CrawlConstants $site_pages = FetchUrl::getPages($sites, true); - list($deduplicated_pages, $schedule_again_pages) = + list($deduplicated_pages, $schedule_again_pages, $duplicates) = $this->deduplicateAndReschedulePages($site_pages); + + $this->found_duplicates = array_merge($this->found_duplicates, + $duplicates); if($can_schedule_again == true) { foreach($schedule_again_pages as $schedule_again_page) { if($schedule_again_page[self::CRAWL_DELAY] == 0) { @@ -535,10 +545,12 @@ class Fetcher implements CrawlConstants * Does page deduplication on an array of downloaded pages using a * BloomFilterBundle of $this->web_archive. Deduplication based * on summaries is also done on the queue server. Also, sorts out pages - * for which no content was downloaded so that they cna be scheduled + * for which no content was downloaded so that they can be scheduled * to be crawled again. * * @param array &$site_pages pages to deduplicate + * @return an array conisting of the deduclicated pages, the not_downloaded + * sites, and the urls of duplicate pages. 
*/ function deduplicateAndReschedulePages(&$site_pages) { @@ -546,7 +558,8 @@ class Fetcher implements CrawlConstants $deduplicated_pages = array(); $not_downloaded = array(); - + $duplicates = array(); + $unseen_page_hashes = $this->web_archive->differencePageKeysFilter($site_pages, self::HASH); @@ -560,13 +573,15 @@ class Fetcher implements CrawlConstants $deduplicated_pages[] = $site; } else if(!isset($site[self::HASH])){ $not_downloaded[] = $site; + } else { + $duplicates[] = $site[self::URL]; } } crawlLog(" Delete duplicated pages time". (changeInMicrotime($start_time))); - return array($deduplicated_pages, $not_downloaded); + return array($deduplicated_pages, $not_downloaded, $duplicates); } /** @@ -946,8 +961,11 @@ class Fetcher implements CrawlConstants $average_total_link_text_length = $doc_statistics[self::AVERAGE_TOTAL_LINK_TEXT_LENGTH]; - foreach($doc_statistics as $doc_key => $info) { + $special_case_fields = array(self::INLINKS, self::SITE_INFO, + self::FILETYPE, self::URL_INFO); + foreach($doc_statistics as $doc_key => $info) { + if(in_array($doc_key, $special_case_fields)) {continue;} $title_length = $info[self::TITLE_LENGTH]; $description_length = $info[self::DESCRIPTION_LENGTH]; $link_length = $info[self::LINK_LENGTH]; @@ -1034,7 +1052,7 @@ class Fetcher implements CrawlConstants number_format($doc_rank, PRECISION); //proxy for page rank $orphan = (isset($info[self::LINK_WORDS]) && - count($info[self::LINK_WORDS]) > 0 ) ? 1 : .5; + $info[self::LINK_WORDS] == true) ? 
1 : .5; $words[$word_key][$doc_key][self::SCORE] = number_format( .8*($doc_rank) @@ -1044,10 +1062,12 @@ class Fetcher implements CrawlConstants } } - - if(STORE_INLINKS_IN_DICTIONARY && - isset($doc_statistics[self::INLINKS])) { - foreach($doc_statistics[self::INLINKS] + + + //add word_keys for inlink, sites, filetype + foreach($special_case_fields as $special_case_field) { + if(isset($doc_statistics[$special_case_field])) { + foreach($doc_statistics[$special_case_field] as $url_word_key => $docs_info) { foreach($docs_info as $doc_key) { $doc_depth = $doc_statistics[$doc_key][self::DOC_DEPTH] + 1; @@ -1060,8 +1080,20 @@ class Fetcher implements CrawlConstants $words[$url_word_key][$doc_key][self::SCORE] = number_format(11 - $doc_depth, PRECISION); } + } + } + } + foreach($this->found_duplicates as $duplicate) { + $doc_key = crawlHash($duplicate); + $url_word_key = crawlHash("info:".$duplicate); + $words[$url_word_key][$doc_key][self::TITLE_WORD_SCORE] = -1; + $words[$url_word_key][$doc_key][self::DESCRIPTION_WORD_SCORE] = -1; + $words[$url_word_key][$doc_key][self::LINK_WORD_SCORE] = -1; + $words[$url_word_key][$doc_key][self::DOC_RANK] = -1; + $words[$url_word_key][$doc_key][self::SCORE] = -1; } + $this->found_duplicates = array(); $this->found_sites[self::INVERTED_INDEX] = $words; @@ -1080,13 +1112,16 @@ class Fetcher implements CrawlConstants function computeDocumentStatistics() { $doc_statistics = array(); - $this->num_seen_sites += count($this->found_sites[self::SEEN_URLS]); - foreach($this->found_sites[self::SEEN_URLS] as $site) { + $num_seen = count($this->found_sites[self::SEEN_URLS]); + $this->num_seen_sites += $num_seen; + for($i = 0; $i < $num_seen; $i++) { + $site = $this->found_sites[self::SEEN_URLS][$i]; $doc_key = crawlHash($site[self::URL]); $doc_statistics[$doc_key][self::URL_WEIGHT] = 3 - log(strlen($site[self::URL])); //negative except short urls - + $doc_statistics[$doc_key][self::DOC_DEPTH] = + log($site[self::INDEX]*NUM_FETCHERS, 10); 
$title_phrase_string = mb_ereg_replace("[[:punct:]]", " ", $site[self::TITLE]); $doc_statistics[$doc_key][self::TITLE_WORDS] = @@ -1107,28 +1142,90 @@ class Fetcher implements CrawlConstants $doc_statistics[$doc_key][self::DESCRIPTION_WORDS]); $this->sum_seen_site_description_length += $doc_statistics[$doc_key][self::DESCRIPTION_LENGTH]; - + $doc_statistics[$doc_key][self::LINK_WORDS] = array(); + $doc_statistics[$doc_key][self::LINK_LENGTH] = 0; + // store the sites the doc_key belongs to, so you can search by site + $url_sites = UrlParser::getHostPaths($site[self::URL]); + $url_sites = array_merge($url_sites, + UrlParser::getHostSubdomains($site[self::URL])); + foreach($url_sites as $url_site) { + if(strlen($url_site) > 0) { + $doc_statistics[self::SITE_INFO][ + crawlHash('site:'.$url_site)][] = $doc_key; + } + } + $doc_statistics[self::URL_INFO][ + crawlHash('info:'.$site[self::URL])][] = $doc_key; + + // store the filetype info + $url_type = UrlParser::getDocumentType($site[self::URL]); + if(strlen($url_type) > 0) { + $doc_statistics[self::FILETYPE][ + crawlHash('filetype:'.$url_type)][] = $doc_key; + } + $link_phrase_string = ""; $link_urls = array(); + //store inlinks so they can be searched by + $num_links = count($site[self::LINKS]); + if($num_links > 0) { + $link_weight = $site[self::WEIGHT]/$num_links; + } else { + $link_weight = 0; + } + $had_links = false; + foreach($site[self::LINKS] as $url => $link_text) { - $link_phrase_string .= " $link_text"; - if(STORE_INLINKS_IN_DICTIONARY) { - $doc_statistics[self::INLINKS][crawlHash($url)][] =$doc_key; + if(strlen($url) > 0) { + $summary = array(); + $had_links = true; + $link_text = strip_tags($link_text); + $link_id = + "url|".$url."|text|$link_text|ref|".$site[self::URL]; + $link_key = crawlHash($link_id).":".crawlHash($url).":" + .crawlHash("info:".$url); + $summary[self::URL] = $link_id; + $summary[self::TITLE] = $url; + // stripping html to be on the safe side + $summary[self::DESCRIPTION] = $link_text; 
+ $summary[self::TIMESTAMP] = $site[self::TIMESTAMP]; + $summary[self::ENCODING] = $site[self::ENCODING]; + $summary[self::HASH] = crawlHash($link_id); + $summary[self::TYPE] = "link"; + $summary[self::HTTP_CODE] = "link"; + $summary[self::WEIGHT] = $link_weight; + $this->found_sites[self::SEEN_URLS][] = $summary; + + $doc_statistics[$link_key][self::URL_WEIGHT] = + 3 - log(strlen($url)); + //negative except short urls + $doc_statistics[$link_key][self::TITLE_WORDS] =array(); + $doc_statistics[$link_key][self::TITLE_LENGTH] = 0; + $doc_statistics[$link_key][self::DESCRIPTION_WORDS] = + array(); + $doc_statistics[$link_key][self::DESCRIPTION_LENGTH] = 0; + + $link_text = + mb_ereg_replace("[[:punct:]]", " ", $link_text); + $doc_statistics[$link_key][self::LINK_WORDS] = + PhraseParser::extractPhrasesAndCount($link_text); + $doc_statistics[$link_key][self::LINK_LENGTH] = + $this->sumCountArray( + $doc_statistics[$link_key][self::LINK_WORDS]); + $this->sum_seen_site_link_length += + $doc_statistics[$link_key][self::LINK_LENGTH]; + + $doc_statistics[$link_key][self::DOC_DEPTH] = + log(10*$site[self::INDEX]*NUM_FETCHERS, 10); + //our proxy for page rank, 10=average links/page + $doc_statistics[self::INLINKS][crawlHash('link:'.$url)][] = + $doc_key; } + $this->found_sites[self::SEEN_URLS][$i][self::LINKS] = + $had_links; } - $link_phrase_string = - mb_ereg_replace("[[:punct:]]", " ", $link_phrase_string); - $doc_statistics[$doc_key][self::LINK_WORDS] = - PhraseParser::extractPhrasesAndCount($link_phrase_string); - $doc_statistics[$doc_key][self::LINK_LENGTH] = - $this->sumCountArray( - $doc_statistics[$doc_key][self::LINK_WORDS]); - $this->sum_seen_site_link_length += - $doc_statistics[$doc_key][self::LINK_LENGTH]; - $doc_statistics[$doc_key][self::DOC_DEPTH] = - log($site[self::INDEX]*NUM_FETCHERS, 10); - //our proxy for page rank, 10=average links/page + } $doc_statistics[self::AVERAGE_TITLE_LENGTH] = diff --git a/bin/queue_server.php b/bin/queue_server.php index 
44d897579..b47b06dc4 100755 --- a/bin/queue_server.php +++ b/bin/queue_server.php @@ -503,19 +503,30 @@ class QueueServer implements CrawlConstants $num_seen = 0; } - + $visited_urls_count = 0; for($i = 0; $i < $num_seen; $i++) { $index_archive->addPageFilter(self::HASH, $seen_sites[$i]); $seen_sites[$i][self::MACHINE] = $machine; $seen_sites[$i][self::MACHINE_URI] = $machine_uri; $seen_sites[$i][self::HASH_URL] = crawlHash($seen_sites[$i][self::URL]); + $link_url_parts = explode("|", $seen_sites[$i][self::URL]); + if(strcmp("url", $link_url_parts[0]) == 0 && + strcmp("text", $link_url_parts[2]) == 0) { + $seen_sites[$i][self::HASH_URL] = + crawlHash($seen_sites[$i][self::URL]). + ":".crawlHash($link_url_parts[1]). + ":".crawlHash("info:".$link_url_parts[1]); + } else { + $visited_urls_count++; + } } if(isset($seen_sites)) { $seen_sites = $index_archive->addPages( - self::HASH_URL, self::SUMMARY_OFFSET, $seen_sites); + self::HASH_URL, self::SUMMARY_OFFSET, $seen_sites, + $visited_urls_count); $summary_offsets = array(); foreach($seen_sites as $site) { @@ -727,13 +738,15 @@ class QueueServer implements CrawlConstants "Removing $url from Queue (shouldn't still be there!)"); $this->web_queue->removeQueue($url); } - - array_push($most_recent_urls, $url); - if($cnt >= NUM_RECENT_URLS_TO_DISPLAY) - { - array_shift($most_recent_urls); + if(strpos($url, "url|") !== 0) { + array_push($most_recent_urls, $url); + if($cnt >= NUM_RECENT_URLS_TO_DISPLAY) + { + array_shift($most_recent_urls); + } + $cnt++; } - $cnt++; + } } @@ -809,6 +822,7 @@ class QueueServer implements CrawlConstants $info_bundle = IndexArchiveBundle::getArchiveInfo( CRAWL_DIR.'/cache/'.self::index_data_base_name.$this->crawl_time); $crawl_status['COUNT'] = $info_bundle['COUNT']; + $crawl_status['VISITED_URLS_COUNT'] = $info_bundle['VISITED_URLS_COUNT']; $crawl_status['DESCRIPTION'] = $info_bundle['DESCRIPTION']; file_put_contents( CRAWL_DIR."/schedules/crawl_status.txt", serialize($crawl_status)); @@ -818,7 
+832,9 @@ class QueueServer implements CrawlConstants crawlLog( "The current crawl description is: ".$info_bundle['DESCRIPTION']); - crawlLog("Total seen urls so far: ".$info_bundle['COUNT']); + crawlLog("Number of unique pages so far: ". + $info_bundle['VISITED_URLS_COUNT']); + crawlLog("Total urls extracted so far: ".$info_bundle['COUNT']); crawlLog("Of these, the most recent urls are:"); foreach($most_recent_urls as $url) { crawlLog("URL: $url"); diff --git a/configs/config.php b/configs/config.php index 02fc6f00b..6337fe76c 100755 --- a/configs/config.php +++ b/configs/config.php @@ -138,7 +138,7 @@ define('MAX_WAITING_HOSTS', 1000); define('URL_FILTER_SIZE', 10000000); /** number of fetchers that will be used in a given crawl */ -define('NUM_FETCHERS', 3); +define('NUM_FETCHERS', 4); /** * maximum number of urls that will be held in ram diff --git a/controllers/search_controller.php b/controllers/search_controller.php index 2003b37e6..ceddfb63d 100755 --- a/controllers/search_controller.php +++ b/controllers/search_controller.php @@ -138,7 +138,8 @@ class SearchController extends Controller implements CrawlConstants } else { $index_time_stamp = 0; //use the default crawl index } - if(isset($_REQUEST['q']) || $activity != "query") { + if(isset($_REQUEST['q']) && strlen($_REQUEST['q']) >0 + || $activity != "query") { if($activity == "query") { $activity_array = $this->extractActivityQuery(); $query = $activity_array[0]; // dirty @@ -224,7 +225,7 @@ class SearchController extends Controller implements CrawlConstants crawlHash($url), $summary_offset); $top_phrases = - $this->phraseModel->getTopPhrases($crawl_item, 20); + $this->phraseModel->getTopPhrases($crawl_item, 3); $top_query = implode(" ", $top_phrases); $phrase_results = $this->phraseModel->getPhrasePageResults( $top_query, $limit, $results_per_page, false); @@ -244,8 +245,10 @@ class SearchController extends Controller implements CrawlConstants break; } - $data['PAGES'] = $phrase_results['PAGES']; - 
$data['TOTAL_ROWS'] = $phrase_results['TOTAL_ROWS']; + $data['PAGES'] = (isset($phrase_results['PAGES'])) ? + $phrase_results['PAGES']: array(); + $data['TOTAL_ROWS'] = (isset($phrase_results['TOTAL_ROWS'])) ? + $phrase_results['TOTAL_ROWS'] : 0; $data['LIMIT'] = $limit; $data['RESULTS_PER_PAGE'] = $results_per_page; @@ -332,11 +335,20 @@ class SearchController extends Controller implements CrawlConstants $machine_uri, $page, $offset, $crawl_time); $cache_file = $cache_item[self::PAGE]; + $request = $cache_item['REQUEST']; + $meta_words = array('link\:', 'site\:', + 'filetype\:', 'info\:', '\-', + 'index:', 'i:', 'weight:', 'w:'); + foreach($meta_words as $meta_word) { + $pattern = "/(\s)($meta_word(\S)+)/"; + $query = preg_replace($pattern, "", $query); + } $query = str_replace("'", " ", $query); $query = str_replace('"', " ", $query); $query = str_replace('\\', " ", $query); + $query = str_replace('|', " ", $query); $query = $this->clean($query, "string"); $page_url = $url; diff --git a/lib/crawl_constants.php b/lib/crawl_constants.php index da45cb25a..0a2dcf79e 100644 --- a/lib/crawl_constants.php +++ b/lib/crawl_constants.php @@ -124,5 +124,9 @@ interface CrawlConstants const PAGE_IMPORTANCE = 'ad'; const MACHINE_URI = 'ae'; + const SITE_INFO = 'af'; + const FILETYPE = 'ag'; + const SUMMARY = 'ah'; + const URL_INFO = 'ai'; } ?> diff --git a/lib/index_archive_bundle.php b/lib/index_archive_bundle.php index aeefd4967..f048e4128 100644 --- a/lib/index_archive_bundle.php +++ b/lib/index_archive_bundle.php @@ -54,28 +54,10 @@ require_once 'utility.php'; */ require_once 'crawl_constants.php'; -/** - * Enumerative interface for common constants between WordIterator and - * IndexArchiveBundle - * - * These constants are used as fields in arrays. They are negative to - * distinguish them from normal array elements 0, 1, 2... 
However, this - * means you need to be slightly careful if you try to sort the array - * as this might screw things up - * - * @author Chris Pollett - * @package seek_quarry - * @subpackage library +/** + *Loads common constants for word indexing */ -interface IndexingConstants -{ - const COUNT = -1; - const END_BLOCK = -2; - const LIST_OFFSET = -3; - const POINT_BLOCK = -4; - const PARTIAL_COUNT = -5; - const NAME = -6; -} +require_once 'indexing_constants.php'; /** @@ -116,427 +98,7 @@ function setOffsetPointers($data, &$objects, $offset_field) return $data; } -/** - * Used to iterate through the documents associated with a word in - * an IndexArchiveBundle. It also makes it easy to get the summaries - * of these documents and restrict the documents by additional words. - * - * A description of how words and the documents containing them are stored - * is given in the documentation of IndexArchiveBundle. To iterate over - * all documents containng a word, its hash, work_key, is formed. Then using - * the Bloom filter for that partition, it is determined if the word is stored - * at all, and if it is, which generations it occurs in. Then the iterator - * is set to point to the first block of the first generation the word appears - * in that is greater than the limit of the WordIterator. Thereafter, - * nextDocsWithWord will advance $this->current_pointer by one per call. - * $this->current_pointer keeps track of which block of documents containing - * the word to return. If it is less than COMMON_WORD_THRESHOLD/BLOCK_SIZE and - * there are still more blocks, then the corresponding block_pointer of the word - * from the generation's partition info_block is used to look up the offset to - * the doc block. If it is greater than this value then the linked list - * of doc blocks pointed to for the partition is followed to get the appropriate - * block. 
This list is in the order that words were stored in the index so - * LIST_OFFSET points to the last block stored, which in turn points to the - * next to last block, etc. Finally, when all the blocks in the linked-list are - * exhausted, the remaining docs for that generation for that word are stored - * in the info block for the word itself (this will always be less than - * BLOCK_SIZE many). Once all the docs for a word for a generation have been - * iterated through, than iteration proceeds to the next generation containing - * the word. - * - * @author Chris Pollett - * @package seek_quarry - * @subpackage library - * @see IndexArchiveBundle - */ -class WordIterator implements IndexingConstants, CrawlConstants -{ - /** - * hash of word that the iterator iterates over - * @var string - */ - var $word_key; - /** - * The IndexArchiveBundle this index is associated with - * @var object - */ - var $index; - /** - * The number of documents already iterated over - * @var int - */ - var $seen_docs; - /** - * @var int - */ - var $restricted_seen_docs; - /** - * The number of documents in the current block before filtering - * by restricted words - * @var int - */ - var $count_block_unfiltered; - /** - * Estimate of the number of documents that this iterator can return - * @var int - */ - var $num_docs; - - /** - * If iterating through the linked-list portions of the documents - * the next byte offset in the WebArchive based linked-list - * @var int - */ - var $next_offset; - /** - * Block number of the last block of docs - * @var int - */ - var $last_pointed_block; - /** - * @var int - */ - var $list_offset; - - /** - * Pointers to offsets for blocks containing docs with the given word - * for the current generation - * @var array - */ - var $block_pointers; - /** - * Number of completely full blocks of documents for the current generation - * @var int - */ - var $num_full_blocks; - /** - * Number of generations word appears in - * @var int - */ - var $num_generations; 
- /** - * Used to store the contents of the last partially full block - * @var int - */ - var $last_block; - /** - * - * @var object - */ - var $info_block; - /** - * Stores the number of the current block of documents we are at in the - * set of all blocks of BLOCK_SIZE many documents - * @var int - */ - var $current_pointer; - /** - * First document that should be returned - * amongst all of the documents associated with the - * iterator's $word_key - * @var int - */ - var $limit; - - /** - * Creates a word iterator with the given parameters. - * - * @param string $word_key hash of word or phrase to iterate docs of - * @param object $index the IndexArchiveBundle to use - * @param int $limit the first element to return from the list of docs - * iterated over - * @param object $info_block the info block of the WebArchive - * associated with the word in the index. If NULL, then this will - * loaded in WordIterator::reset() - */ - public function __construct($word_key, $index, $limit = 0, $info_block = NULL) - { - $this->word_key = $word_key; - $this->index = $index; - $this->limit = $limit; - $this->reset($info_block); - } - - /** - * Returns the iterators to the first document block that it could iterate - * over - * - * @param object $info_block the header block in the index WebArchiveBundle - * for the word this iterator iterates over. If not NULL, this saves - * the time to load it. If not it will be loaded, but this will be - * slower. 
- */ - public function reset($info_block = NULL) - { - $this->restricted_seen_docs = 0; - $this->count_block_unfiltered = 0; - - $partition = - WebArchiveBundle::selectPartition($this->word_key, - $this->index->num_partitions_index); - - if($info_block == NULL) { - $this->info_block = - $this->index->getPhraseIndexInfo($this->word_key); - } else { - $this->info_block = $info_block; - } - if($this->info_block !== NULL) { - $this->num_generations = count($this->info_block['GENERATIONS']); - $count_till_generation = $this->info_block[self::COUNT]; - - while($this->limit >= $count_till_generation) { - $this->info_block['CURRENT_GENERATION_INDEX']++; - if($this->num_generations <= - $this->info_block['CURRENT_GENERATION_INDEX']) { - $this->num_docs = 0; - $this->current_pointer = -1; - return; - } - $info_block = $this->index->getPhraseIndexInfo( - $this->word_key, - $this->info_block['CURRENT_GENERATION_INDEX'], - $this->info_block); - if($info_block !== NULL) { - $this->info_block = $info_block; - } - $count_till_generation += $this->info_block[self::COUNT]; - } - - - } - - $this->seen_docs = $count_till_generation - - $this->info_block[self::COUNT]; - $this->initGeneration(); - - - } - - /** - * Sets up the iterator to iterate through the current generation. 
- * - * @return bool whether the initialization succeeds - */ - public function initGeneration() - { - - if($this->info_block !== NULL) { - $info_block = $this->index->getPhraseIndexInfo( - $this->word_key, $this->info_block['CURRENT_GENERATION_INDEX'], - $this->info_block); - if($info_block === NULL) { - return false; - } - $this->info_block = $info_block; - $this->num_docs = $info_block['TOTAL_COUNT']; - $this->num_docs_generation = $info_block[self::COUNT]; - - $this->current_pointer = - max(floor(($this->limit - $this->seen_docs) / BLOCK_SIZE), 0); - $this->seen_docs += $this->current_pointer*BLOCK_SIZE; - $this->last_block = $info_block[self::END_BLOCK]; - $this->num_full_blocks = - floor($this->num_docs_generation / BLOCK_SIZE); - if($this->num_docs_generation > COMMON_WORD_THRESHOLD) { - $this->last_pointed_block = - floor(COMMON_WORD_THRESHOLD / BLOCK_SIZE); - } else { - $this->last_pointed_block = $this->num_full_blocks; - } - - for($i = 0; $i < $this->last_pointed_block; $i++) { - if(isset($info_block[$i])) { - $this->block_pointers[$i] = $info_block[$i]; - } - } - - if($this->num_docs_generation > COMMON_WORD_THRESHOLD) { - if($info_block[self::LIST_OFFSET] === NULL) { - $this->list_offset = NULL; - } else { - $this->list_offset = $info_block[self::LIST_OFFSET]; - } - } - - } else { - $this->num_docs = 0; - $this->num_docs_generation = 0; - $this->current_pointer = -1; - } - return true; - } - /** - * Gets the block of doc summaries associated with the current doc - * pointer and which match the array of additional word restrictions - * @param array $restrict_phrases an array of additional words or phrases - * to see if contained in summary - * @return array doc summaries that match - */ - public function currentDocsWithWord($restrict_phrases = NULL) - { - if($this->num_generations <= - $this->info_block['CURRENT_GENERATION_INDEX']) { - return -1; - } - $generation = - $this->info_block['GENERATIONS'][ - $this->info_block['CURRENT_GENERATION_INDEX']]; - 
if($this->current_pointer >= 0) { - if($this->current_pointer == $this->num_full_blocks) { - $pages = $this->last_block; - } else if ($this->current_pointer >= $this->last_pointed_block) { - /* if there are more than COMMON_WORD_THRESHOLD many - results and we're not at the last block yet - */ - if($this->list_offset === NULL) { - return -1; - } - $offset = $this->list_offset; - $found = false; - do { - /* the link list is actually backwards to the order we want - For now, we cycle along the list from the last data - stored until we find the block we want. This is slow - but we are relying on the fact that each generation is - not too big. - */ - $doc_block = $this->index->getWordDocBlock($this->word_key, - $offset, $generation); - $word_keys = array_keys($doc_block); - $found_key = NULL; - foreach($word_keys as $word_key) { - if(strstr($word_key, $this->word_key.":")) { - $found_key = $word_key; - if(isset($doc_block[ - $found_key][self::LIST_OFFSET])) { - //only one list offset/docblock - break; - } - } - } - if($found_key === NULL) { - break; - } - if(isset($doc_block[ - $this->word_key.":".$this->current_pointer])) { - $found = true; - break; - } - $offset = $doc_block[$found_key][self::LIST_OFFSET]; - } while($offset != NULL); - if($found != true) { - $pages = array(); - } else { - $pages = $doc_block[ - $this->word_key.":".$this->current_pointer]; - } - } else { - //first COMMON_WORD_THRESHOLD many results fast - if(isset($this->block_pointers[$this->current_pointer])) { - $doc_block = $this->index->getWordDocBlock($this->word_key, - $this->block_pointers[$this->current_pointer], - $generation); - if(isset( - $doc_block[$this->word_key.":".$this->current_pointer] - )) { - $pages = - $doc_block[ - $this->word_key.":".$this->current_pointer]; - } else { - $pages = array(); - } - } else { - $pages = array(); - } - } - - if($this->seen_docs < $this->limit) { - $diff_offset = $this->limit - $this->seen_docs; - - $pages = array_slice($pages, $diff_offset); - } - 
$this->count_block_unfiltered = count($pages); - - if($restrict_phrases != NULL) { - - $out_pages = array(); - if(count($pages) > 0 ) { - foreach($pages as $doc_key => $doc_info) { - - if(isset($doc_info[self::SUMMARY_OFFSET])) { - - $page = $this->index->getPage( - $doc_key, $doc_info[self::SUMMARY_OFFSET]); - /* build a string out of title, links, - and description - */ - $page_string = mb_strtolower( - PhraseParser::extractWordStringPageSummary( - $page)); - - $found = true; - foreach($restrict_phrases as $phrase) { - if(mb_strpos($page_string, $phrase) - === false) { - $found = false; - } - } - if($found == true) { - $out_pages[$doc_key] = $doc_info; - } - } - } - } - $pages = $out_pages; - } - return $pages; - } else { - return -1; - } - } - - /** - * Get the current block of doc summaries for the word iterator and advances - * the current pointer to the next block - * - * @param array $restrict_phrases additional words to restrict doc summaries - * returned - * @return array doc summaries matching the $restrict_phrases - */ - public function nextDocsWithWord($restrict_phrases = NULL) - { - $doc_block = $this->currentDocsWithWord($restrict_phrases); - if($this->seen_docs < $this->limit) { - $this->seen_docs = $this->count_block_unfiltered + $this->limit; - } else { - $this->seen_docs += $this->count_block_unfiltered; - } - $this->restricted_seen_docs += count($doc_block); - if($doc_block == -1 || !is_array($doc_block)) { - return NULL; - } - - $this->current_pointer ++; - if($this->current_pointer > $this->num_full_blocks) { - $flag = false; - while ($this->info_block['CURRENT_GENERATION_INDEX'] < - $this->num_generations - 1 && !$flag) { - $this->info_block['CURRENT_GENERATION_INDEX']++; - $flag = $this->initGeneration(); - } - if ($this->info_block['CURRENT_GENERATION_INDEX'] >= - $this->num_generations - 1) { - $this->current_pointer = - 1; - } - } - - return $doc_block; - - } - -} /** * Encapsulates a set of web page summaries and an inverted word-index of 
terms @@ -679,6 +241,8 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } $this->summaries = new WebArchiveBundle($dir_name."/summaries", $filter_size, $num_partitions_summaries, $description); + $this->summaries->initCountIfNotExists("VISITED_URLS_COUNT"); + $this->num_partitions_summaries = $this->summaries->num_partitions; $this->index = new WebArchiveBundle( @@ -699,12 +263,16 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants * @param string $key_field field used to select partition * @param string $offset_field field used to record offsets after storing * @param array &$pages data to store + * @param int $visited_urls_count number to add to the count of visited urls + * (visited urls is a smaller number than the total count of objects + * stored in the index). * @return array $pages adjusted with offset field */ - public function addPages($key_field, $offset_field, $pages) + public function addPages($key_field, $offset_field, $pages, + $visited_urls_count) { $result = $this->summaries->addPages($key_field, $offset_field, $pages); - + $this->summaries->addCount($visited_urls_count, "VISITED_URLS_COUNT"); return $result; } @@ -828,7 +396,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants $tmp = array_merge($block_data[$word_key][self::END_BLOCK],$docs_info); - uasort($tmp, "scoreOrderCallback"); + uasort($tmp, "docRankOrderCallback"); $add_cnt = count($tmp); $num_blocks = floor($add_cnt / BLOCK_SIZE); $block_data[$word_key][self::END_BLOCK] = @@ -936,63 +504,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants return true; } - /** - * Gets doc summaries of documents containing a given word and meeting the - * additional provided criteria - * @param string $word_key the word to iterate over to get document results - * of - * @param int $limit number of first document in order to return - * @param int $num number of documents to return summaries of - * @param array 
$restrict_phrases additional words and phrase to store - * further restrict the search - * @param string $phrase_key a hash of the word and restricted phrases to - * store the results of the look up - * @param array $phrase_info info block of the word - * @return array document summaries - */ - public function getSummariesByHash($word_key, $limit, $num, - $restrict_phrases = NULL, $phrase_key = NULL, $phrase_info = NULL) - { - if($phrase_key == NULL) { - $phrase_key = $word_key; - } - - if($phrase_info == NULL) { - $phrase_info = $this->getPhraseIndexInfo($phrase_key); - } - - if($phrase_info == NULL || (isset($phrase_info[self::PARTIAL_COUNT]) - && $phrase_info[self::PARTIAL_COUNT] < $limit + $num)) { - $this->addPhraseIndex( - $word_key, $restrict_phrases, $phrase_key, $limit + $num); - } - - $iterator = new WordIterator($phrase_key, $this, $limit, $phrase_info); - - $num_retrieved = 0; - $pages = array(); - - while(is_array($next_docs = $iterator->nextDocsWithWord()) && - $num_retrieved < $num) { - $num_docs_in_block = count($next_docs); - - foreach($next_docs as $doc_key => $doc_info) { - if(isset($doc_info[self::SUMMARY_OFFSET])) { - $page = $this->getPage( - $doc_key, $doc_info[self::SUMMARY_OFFSET]); - $pages[] = array_merge($doc_info, $page); - $num_retrieved++; - } - if($num_retrieved >= $num) { - break 2; - } - } - } - $results['TOTAL_ROWS'] = $iterator->num_docs; - $results['PAGES'] = $pages; - return $results; - } - /** * Gets the page out of the summaries WebArchiveBundle with the given * key and offset @@ -1108,7 +619,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants WebArchiveBundle::selectPartition( $phrase_key, $this->num_partitions_index); $info = array(); - if($info_block == NULL) { if(!$this->initPartitionIndexFilter($partition)) { @@ -1116,7 +626,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } $filter = & $this->index_partition_filters[$partition]; - if(!$filter->contains($phrase_key)) { + 
if($filter == NULL || !$filter->contains($phrase_key)) { return NULL; } @@ -1196,86 +706,6 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants } - /** - * Adds the supplied phrase to the IndexArchiveBundle. - * - * The most selective word in the phrase is $word_key, the additional - * words are in $restrict_phrases, the hash of the phrase to add is - * $phrase_key, and if the will be a lot of results compute at least - * the first $num_needed. - * - * @param string $word_key hash of most selective word in phrase - * @param array $restrict_phrases additional words in phrase - * @param string $phrase_key hash of phrase to add - * @param $num_needed minimum number of doc results to save if possible - */ - public function addPhraseIndex($word_key, $restrict_phrases, - $phrase_key, $num_needed) - { - if($phrase_key == NULL) { - return; - } - - $partition = - WebArchiveBundle::selectPartition($phrase_key, - $this->num_partitions_index); - - $iterator = new WordIterator($word_key, $this); - $current_count = 0; - $buffer = array(); - $word_data = array(); - $partial_flag = false; - $first_time = true; - - while(is_array($next_docs = - $iterator->nextDocsWithWord($restrict_phrases))) { - $buffer = array_merge($buffer, $next_docs); - $cnt = count($buffer); - - if($cnt > COMMON_WORD_THRESHOLD) { - $word_data[$phrase_key] = - array_slice($buffer, 0, COMMON_WORD_THRESHOLD); - - $this->addPartitionWordData($partition, $word_data, $first_time); - $first_time = false; - $buffer = array_slice($buffer, COMMON_WORD_THRESHOLD); - $current_count += COMMON_WORD_THRESHOLD; - - if($current_count > $num_needed) { - /* notice $num_needed only plays a role when - greater than COMMON_WORD_THRESHOLD - */ - $partial_flag = true; - break; - } - } - } - - $word_data[$phrase_key] = $buffer; - - $this->addPartitionIndexFilter( - $partition, - "delete". $phrase_key . 
($this->generation_info['ACTIVE'] - 1)); - - $this->addPartitionWordData($partition, $word_data); - $this->addPartitionIndexFilter($partition, $phrase_key); - $this->addPartitionIndexFilter($partition, $phrase_key . - $this->generation_info['ACTIVE']); - $this->index_partition_filters[$partition]->save(); - file_put_contents($this->dir_name."/generation.txt", - serialize($this->generation_info)); - - $block_info = $this->readPartitionInfoBlock($partition); - $info = $block_info[$phrase_key]; - $current_count += count($buffer); - if($partial_flag) { - $info[self::PARTIAL_COUNT] = $current_count; - $info[self::COUNT] = - floor($current_count*$iterator->num_docs/$iterator->seen_docs); - $this->setPhraseIndexInfo($phrase_key, $info); - } - } - /** * Computes the words which appear in the fewest or most documents * @@ -1296,6 +726,7 @@ class IndexArchiveBundle implements IndexingConstants, CrawlConstants $words_array[$word_key] = $info['TOTAL_COUNT']; } else { $words_array[$word_key] = 0; + return NULL; } } diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php new file mode 100644 index 000000000..c92ae4098 --- /dev/null +++ b/lib/index_bundle_iterators/group_iterator.php @@ -0,0 +1,361 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage library + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads common constants for word indexing + */ +require_once BASE_DIR.'/lib/indexing_constants.php'; + +/** + *Loads base class for iterating + */ +require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php'; + +/** + * This iterator is used to group together documents or document parts + * which share the same url. For instance, a link document item and + * the document that it links to will both be stored in the IndexArchiveBundle + * by the QueueServer. This iterator would combine both these items into + * a single document result with a sum of their score, and a summary, if + * returned, containing text from both sources. 
The iterator's purpose is + * vaguely analagous to a SQL GROUP BY clause + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage library + * @see IndexArchiveBundle + */ +class GroupIterator extends IndexBundleIterator +{ + /** + * The iterator we are using to get documents from + * @var string + */ + var $index_bundle_iterator; + + /** + * The number of documents in the current block before filtering + * by restricted words + * @var int + */ + var $count_block_unfiltered; + /** + * The number of documents in the current block after filtering + * by restricted words + * @var int + */ + var $count_block; + + var $current_block_hashes; + + /** + * The number of iterated docs before the restriction test + * @var int + */ + var $seen_docs_unfiltered; + + /** + * hashed url keys used to keep track of track of groups seen so far + * @var array + */ + var $grouped_keys; + + + /** + * Creates a group iterator with the given parameters. + * + * @param object $index_bundle_iterator to use as a source of documents + * to iterate over + * @param int $limit the first element to return from the list of docs + * iterated over + */ + function __construct($index_bundle_iterator, $limit = 0) + { + $this->index_bundle_iterator = $index_bundle_iterator; + $this->limit = $limit; + $this->num_docs = $this->index_bundle_iterator->num_docs; + $this->reset(); + } + + /** + * Returns the iterators to the first document block that it could iterate + * over + */ + function reset() + { + $this->index_bundle_iterator->reset(); + $time = time(); + $this->grouped_keys = array(); + // -1 == never save, so file name not used using time to be safer + $this->seen_docs = 0; + $this->seen_docs_unfiltered = 0; + $beneath_limit = true; + while($beneath_limit == true) { + + $doc_block = $this->currentDocsWithWord(); + if($doc_block == -1 || !is_array($doc_block)) { + $beneath_limit = false; + continue; + } + if($this->seen_docs + $this->count_block >= $this->limit) { + $beneath_limit = 
false; + continue; + } + $this->advance(); + } + + } + + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and score if there are docs left, -1 otherwise + */ + function findDocsWithWord() + { + $pages = + $this->index_bundle_iterator->currentDocsWithWord(); + $this->count_block_unfiltered = count($pages); + if(!is_array($pages)) { + return $pages; + } + + $this->current_block_hashes = array(); + $pre_out_pages = array(); + + if($this->count_block_unfiltered > 0 ) { + $i = $this->seen_docs; + foreach($pages as $doc_key => $doc_info) { + if(!is_array($doc_info)) {continue;} + $doc_info['KEY'] = $doc_key; + $doc_key_parts = explode(":", $doc_key); + if(count($doc_key_parts) == 1) { + $hash_url = $doc_key_parts[0]; + $doc_info['IS_PAGE'] = true; + } else { + $hash_url = $doc_key_parts[1]; + $doc_info['IS_PAGE'] = false; + } + if(isset($this->grouped_keys[$hash_url])) { + if( $i < $this->limit) { + continue; + } else { + if(isset($pre_out_pages[$hash_url]) ) { + $pre_out_pages[$hash_url][] = $doc_info; + if($doc_info['IS_PAGE'] == true) { + $pre_out_pages[$hash_url]['IS_PAGE'] = true; + } else { + $pre_out_pages[$hash_url]['HASH_INFO_URL'] = + $doc_key_parts[2]; + } + } + } + } else { + + $pre_out_pages[$hash_url][] = $doc_info; + if($doc_info['IS_PAGE'] == true) { + $pre_out_pages[$hash_url]['IS_PAGE'] = true; + } else { + $pre_out_pages[$hash_url]['HASH_INFO_URL'] = + $doc_key_parts[2]; + } + $this->current_block_hashes[] = $hash_url; + $i++; + } + } + //get summary page for groups of link data if exists and don't have + foreach($pre_out_pages as $hash_url => $data) { + if(!isset($data['IS_PAGE'])) { + $hash_info_url= $pre_out_pages[$hash_url]['HASH_INFO_URL']; + $word_iterator = + new WordIterator($hash_info_url, + $this->getIndex(), 0); + $doc_array = $word_iterator->currentDocsWithWord(); + if(is_array($doc_array) && count($doc_array) == 1) { + $keys = array_keys($doc_array); 
+ $key = $keys[0]; + if($doc_array[$key][self::SCORE] > 0) { + $pre_out_pages[$hash_url][$key] = $doc_array[$key]; + $pre_out_pages[$hash_url][$key]['IS_PAGE'] = true; + } else { + unset($pre_out_pages[$hash_url]); + } + } + } else { + unset($pre_out_pages[$hash_url]['IS_PAGE']); + } + if(isset($pre_out_pages[$hash_url]['HASH_INFO_URL'])) { + unset($pre_out_pages[$hash_url]['HASH_INFO_URL']); + } + } + $this->count_block = count($pre_out_pages); + + if($this->seen_docs < $this->limit) { + $total_docs = $this->seen_docs + $this->count_block; + if($total_docs < $this->limit) { + $pre_out_pages =array(); + } else { + $pre_out_pages = array_slice($pre_out_pages, + $this->limit - $this->seen_docs, NULL, true); + } + } + $out_pages = array(); + foreach($pre_out_pages as $hash_url => $group_infos) { + foreach($group_infos as $doc_info) { + $is_page = $doc_info['IS_PAGE']; + unset($doc_info['IS_PAGE']); + if(!isset($out_pages[$hash_url])) { + $out_pages[$hash_url] = $doc_info; + $out_pages[$hash_url][self::SUMMARY_OFFSET] = array(); + if(isset($doc_info[self::SUMMARY_OFFSET]) ) { + $out_pages[$hash_url][self::SUMMARY_OFFSET] = + array(array($doc_info["KEY"], + $doc_info[self::SUMMARY_OFFSET])); + unset($out_pages[$hash_url]["KEY"]); + } + } else { + $fields = array_keys($out_pages[$hash_url]); + foreach($fields as $field) { + if(isset($doc_info[$field]) && + $field != self::SUMMARY_OFFSET) { + $out_pages[$hash_url][$field] += + $doc_info[$field]; + } else if($field == self::SUMMARY_OFFSET && + $is_page == true) { + array_unshift($out_pages[$hash_url][$field], + array($hash_url, $doc_info[$field])); + } else if($field == self::SUMMARY_OFFSET) { + $out_pages[$hash_url][$field][] = + array($doc_info["KEY"], $doc_info[$field]); + } + } + } + } + } + $pages = $out_pages; + } + $this->pages = $pages; + return $pages; + + } + + /** + * Gets the summaries associated with the keys provided the keys + * can be found in the current block of docs returned by this iterator + * @param 
array $keys keys to try to find in the current block of returned + * results + * @return array doc summaries that match provided keys + */ + function getSummariesFromCurrentDocs($keys = NULL) + { + if($this->current_block_fresh == false) { + $result = $this->currentDocsWithWord(); + if(!is_array($result)) { + return $result; + } + } + if(!is_array($this->pages)) { + return $this->pages; + } + if($keys == NULL) { + $keys = array_keys($this->pages); + } + $out_pages = array(); + foreach($keys as $doc_key) { + if(!isset($this->pages[$doc_key])) { + continue; + } else { + $doc_info = $this->pages[$doc_key]; + } + if(isset($doc_info[self::SUMMARY_OFFSET]) && + is_array($doc_info[self::SUMMARY_OFFSET])) { + $out_pages[$doc_key] = $doc_info; + foreach($doc_info[self::SUMMARY_OFFSET] as $offset_array) { + list($key, $summary_offset) = $offset_array; + $index = $this->getIndex($key); + $page = $index->getPage( + $key, $summary_offset); + if(!isset($out_pages[$doc_key][self::SUMMARY])) { + $out_pages[$doc_key][self::SUMMARY] = $page; + } else if (isset($page[self::DESCRIPTION])) { + $out_pages[$doc_key][self::SUMMARY][self::DESCRIPTION].= + " .. 
".$page[self::DESCRIPTION]; + } + } + } + } + return $out_pages; + + } + + + + /** + * Forwards the iterator one group of docs + */ + function advance() + { + $this->advanceSeenDocs(); + + $this->seen_docs_unfiltered += $this->count_block_unfiltered; + + if($this->seen_docs_unfiltered > 0) { + $this->num_docs = + floor(($this->seen_docs*$this->index_bundle_iterator->num_docs)/ + $this->seen_docs_unfiltered); + } else { + $this->num_docs = 0; + } + + + foreach($this->current_block_hashes as $hash_url) { + $this->grouped_keys[$hash_url] = true; + } + + $this->index_bundle_iterator->advance(); + + } + + /** + * Returns the index associated with this iterator + * @return object the index + */ + function getIndex($key = NULL) + { + return $this->index_bundle_iterator->getIndex($key); + } +} +?> diff --git a/lib/index_bundle_iterators/index_bundle_iterator.php b/lib/index_bundle_iterators/index_bundle_iterator.php new file mode 100644 index 000000000..6759e80fa --- /dev/null +++ b/lib/index_bundle_iterators/index_bundle_iterator.php @@ -0,0 +1,215 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage library + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads common constants for word indexing + */ +require_once BASE_DIR.'/lib/indexing_constants.php'; + +/** + * Abstract classed used to model iterating documents indexed in + * an IndexArchiveBundle or set of such bundles. + * + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage library + * @see IndexArchiveBundle + */ +abstract class IndexBundleIterator implements IndexingConstants, CrawlConstants +{ + + /** + * Estimate of the number of documents that this iterator can return + * @var int + */ + var $num_docs; + + /** + * The number of documents already iterated over + * @var int + */ + var $seen_docs; + + /** + * First document that should be returned + * amongst all of the documents associated with the + * iterator's $word_key + * @var int + */ + var $limit; + /** + * The number of documents in the current block + * @var int + */ + var $count_block; + + /** + * Cache of what currentDocsWithWord returns + * @var array + */ + var $pages; + + /** + * Says whether the value in $this->count_block is up to date + * @var bool + */ + var $current_block_fresh; + + + + /** + * Returns the iterators to the first document block that it could iterate + * over + */ + abstract function reset(); + + /** + * Forwards the iterator one group of docs + */ + abstract function advance(); + /** + * Returns the index associated with this iterator + * @return object the index + */ + abstract function getIndex($key = NULL); + + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and score if there are docs left, -1 otherwise + */ + abstract function findDocsWithWord(); + + 
/** + * Gets the current block of doc ids and score associated with the + * this iterators word + * + * @param bool $with_summaries specifies whether or not to return the + * summaries associated with the document + * @return mixed doc ids and score if there are docs left, -1 otherwise + */ + function currentDocsWithWord() + { + if($this->current_block_fresh == true) { + return $this->pages; + } + $this->current_block_fresh = true; + return $this->findDocsWithWord(); + } + + /** + * Gets the summaries associated with the keys provided the keys + * can be found in the current block of docs returned by this iterator + * @param array $keys keys to try to find in the current block of returned + * results + * @return array doc summaries that match provided keys + */ + function getSummariesFromCurrentDocs($keys = NULL) + { + + $index = $this->getIndex(); + if($this->current_block_fresh == false) { + $pages = $this->currentDocsWithWord(); + if(!is_array($pages)) { + return $pages; + } + } else { + $pages = & $this->pages; + } + if($keys == NULL) { + if(is_array($pages)) { + $keys = array_keys($pages); + } else { + return NULL; + } + } + $out_pages = array(); + + foreach($keys as $doc_key) { + if(!isset($pages[$doc_key])) { + continue; + } else { + $doc_info = $pages[$doc_key]; + } + if(isset($doc_info[self::SUMMARY_OFFSET])) { + $page = $index->getPage( + $doc_key, $doc_info[self::SUMMARY_OFFSET]); + $out_pages[$doc_key] = $doc_info; + $out_pages[$doc_key][self::SUMMARY] = $page; + } + }; + return $out_pages; + } + + /** + * Get the current block of doc summaries for the word iterator and advances + * the current pointer to the next block + * + * @return array doc summaries matching the $this->restrict_phrases + */ + function nextDocsWithWord() + { + $doc_block = $this->getSummariesFromCurrentDocs(); + + if($doc_block == -1 || !is_array($doc_block) ) { + return NULL; + } + + $this->advance(); + + return $doc_block; + + } + + /** + * Updates the seen_docs count during an 
advance() call + */ + function advanceSeenDocs() + { + + if($this->current_block_fresh != true) { + $doc_block = $this->currentDocsWithWord(); + if($doc_block == -1 || !is_array($doc_block) ) { + return; + } + } + $this->current_block_fresh = false; + $this->seen_docs += $this->count_block; + } + +} +?> diff --git a/lib/index_bundle_iterators/intersect_iterator.php b/lib/index_bundle_iterators/intersect_iterator.php new file mode 100644 index 000000000..8a4d6d5b5 --- /dev/null +++ b/lib/index_bundle_iterators/intersect_iterator.php @@ -0,0 +1,302 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage library + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads common constants for word indexing + */ +require_once BASE_DIR.'/lib/indexing_constants.php'; + +/** + *Loads BloomFilterFile to remember things we've already grouped + */ +require_once BASE_DIR.'/lib/bloom_filter_file.php'; + + +/** + *Loads base class for iterating + */ +require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php'; + +/** + * Used to iterate over the documents which occur in all of a set of + * WordIterator results + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage library + * @see IndexArchiveBundle + */ +class IntersectIterator extends IndexBundleIterator +{ + /** + * An array of iterators whose interection we get documents from + * @var array + */ + var $index_bundle_iterators; + /** + * Number of elements in $this->index_bundle_iterators + * @var int + */ + var $num_iterators; + + /** + * The number of documents in the current block before filtering + * by restricted words + * @var int + */ + var $count_block_unfiltered; + /** + * The number of documents in the current block after filtering + * by restricted words + * @var int + */ + var $count_block; + + /** + * The number of iterated docs before the restriction test + * @var int + */ + var $seen_docs_unfiltered; + + /** + * Index of the iterator amongst those we are intersecting to advance + * next + * @var int + */ + var $to_advance_index; + + /** + * Creates an intersect iterator with the given parameters. 
+ * + * @param object $index_bundle_iterator to use as a source of documents + * to iterate over + * @param int $limit the first element to return from the list of docs + * iterated over + */ + function __construct($index_bundle_iterators, $limit = 0) + { + $this->index_bundle_iterators = $index_bundle_iterators; + $this->limit = $limit; + + $this->num_iterators = count($index_bundle_iterators); + $this->num_docs = -1; + + /* + the most results we can return is the size of the least num_docs + of what we are itrerating over + */ + for($i = 0; $i < $this->num_iterators; $i++) { + if( $this->num_docs < 0 || + $this->index_bundle_iterators[$i]->num_docs < $this->num_docs) { + $this->num_docs = $this->index_bundle_iterators[$i]->num_docs; + } + } + $this->reset(); + } + + /** + * Returns the iterators to the first document block that it could iterate + * over + */ + function reset() + { + foreach($this->index_bundle_iterators as $iterator) { + $iterator->reset(); + } + + $this->seen_docs = 0; + $this->seen_docs_unfiltered = 0; + $beneath_limit = true; + while($beneath_limit == true) { + $doc_block = $this->currentDocsWithWord(); + if($doc_block == -1 || !is_array($doc_block)) { + $beneath_limit = false; + continue; + } + if($this->seen_docs + $this->count_block >= $this->limit) { + $beneath_limit = false; + continue; + } + $this->advance(); + } + } + + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and rank if there are docs left, -1 otherwise + */ + function findDocsWithWord() + { + $pages = array(); + $high_ranks = array(); + $last = $this->num_iterators - 1; + for($i = 0; $i < $this->num_iterators; $i++) { + $pages[$i] = + $this->index_bundle_iterators[$i]->currentDocsWithWord(); + if(!is_array($pages[$i]) || count($pages[$i]) == 0) { + $this->to_advance_index = $i; + return $pages[$i]; + } + list($low_ranks[$i], $high_ranks[$i]) = + $this->lowHighRanks($pages[$i], $i); + } 
+ uasort($low_ranks, "docRankOrderCallback"); + + $low_ranks = array_values($low_ranks); + + $low_rank = $low_ranks[$last][self::DOC_RANK]; + + $this->to_advance_index = $low_ranks[0]["INDEX"]; + $this->count_block_unfiltered = count($pages[$this->to_advance_index]); + + $docs = array(); + $looping = true; + + while ($looping == true) { + for($i = 0; $i <= $last; $i++) { + list( ,$high_ranks[$i]) = + $this->lowHighRanks($pages[$i], $i, false); + } + $broke = false; + $score = 0; + $high_rank = $high_ranks[0][self::DOC_RANK]; + $high_key = $high_ranks[0]["KEY"]; + $high_index = $high_ranks[0]["INDEX"]; + $to_deletes = array(); + for($i = 1; $i <= $last; $i++) { + if($high_ranks[$i][self::DOC_RANK] < $low_rank ) { + $looping = false; + break 2; + } + if($high_ranks[$i][self::DOC_RANK] > $high_rank || + ($high_ranks[$i][self::DOC_RANK] == $high_rank && + strcmp($high_ranks[$i]["KEY"], $high_key) > 0) + ) { + $broke = true; + $high_rank = $high_ranks[$i][self::DOC_RANK]; + $high_index = $high_ranks[$i]["INDEX"]; + $high_key = $high_ranks[$i]["KEY"]; + $to_deletes[$high_index] = $high_key; + } + $score += $high_ranks[$i][self::SCORE]; + } + if($broke == false) { + $docs[$high_key] = $pages[$high_index][$high_key]; + $docs[$high_key][self::SCORE] = $score; + $to_deletes[$high_index] = $high_key; + } + + foreach($to_deletes as $index => $key) { + unset($pages[$index][$key]); + if(count($pages[$index]) == 0) { + $looping = false; + } + } + + } + $this->count_block = count($docs); + $this->pages = $docs; + return $docs; + } + + /** + * Given a collection of documents, returns info about the low and high + * ranking documents. 
Namely, their ranks, keys, + * index in word iterator array, and scores + * + * @param array &$docs documents to get low high info from + * @param int $index which word iterator these docs came from + * @param boo $sort_flag whether to sort the docs (if true) or to assume + * the docs are already sorted by rank + * @return array desired info + */ + function lowHighRanks(&$docs, $index, $sort_flag = true) + { + if($sort_flag == true) { + uasort($docs, "docRankOrderCallback"); + } + reset($docs); + $high = array(); + $high["KEY"] = key($docs); + $high[self::DOC_RANK] = $docs[$high["KEY"]][self::DOC_RANK]; + $high[self::SCORE] = $docs[$high["KEY"]][self::SCORE]; + $high["INDEX"] = $index; + end($docs); + $low = array(); + $low["KEY"] = key($docs); + $low[self::DOC_RANK] = $docs[$low["KEY"]][self::DOC_RANK]; + $low[self::SCORE] = $docs[$low["KEY"]][self::SCORE]; + $low["INDEX"] = $index; + return array($low, $high); + } + + /** + * Forwards the iterator one group of docs + */ + function advance() + { + $this->advanceSeenDocs(); + + $this->seen_docs_unfiltered += $this->count_block_unfiltered; + + $min_num_docs = 10000000000; + for($i = 0; $i < $this->num_iterators; $i++) { + if($this->index_bundle_iterators[$i]->num_docs < $min_num_docs) { + $min_num_docs = $this->index_bundle_iterators[$i]->num_docs; + } + } + if($this->seen_docs_unfiltered > 0) { + $this->num_docs = + floor(($this->seen_docs * $min_num_docs) / + $this->seen_docs_unfiltered); + } else { + $this->num_docs = 0; + } + $this->index_bundle_iterators[$this->to_advance_index]->advance(); + + } + + /** + * Returns the index associated with this iterator + * @return object the index + */ + function getIndex($key = NULL) + { + return $this->index_bundle_iterators[0]->getIndex($key = NULL); + } +} +?> diff --git a/lib/index_bundle_iterators/phrase_filter_iterator.php b/lib/index_bundle_iterators/phrase_filter_iterator.php new file mode 100644 index 000000000..943a84188 --- /dev/null +++ 
b/lib/index_bundle_iterators/phrase_filter_iterator.php @@ -0,0 +1,311 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage library + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads common constants for word indexing + */ +require_once BASE_DIR.'/lib/indexing_constants.php'; + +/** + *Loads base class for iterating + */ +require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php'; + +/** + * Used to iterate through a collection of documents to return only those + * which have certain restricted_phrases and don't have disallowed_phrases. + * + * For restricted_phrases a string like "Chris * Homepage" will match any + * string where * has been replace by any other string. So for example it will + * match Chris Pollett's Homepage. 
+ * + * disallowed_phrases are really just disallowed words and must be an exact + * match + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage library + * @see IndexArchiveBundle + */ +class PhraseFilterIterator extends IndexBundleIterator +{ + /** + * The iterator we are using to get documents from + * @var string + */ + var $index_bundle_iterator; + + /** + * This iterator returns only documents containing all the elements of + * restrict phrases + * @var array + */ + var $restrict_phrases; + + /** + * This iterator returns only documents not containing any the elements of + * disallow phrases + * @var array + */ + var $disallow_phrases; + /** + * The number of documents in the current block before filtering + * by restricted words + * @var int + */ + var $count_block_unfiltered; + + /** + * The number of iterated docs before the restriction test + * @var int + */ + var $seen_docs_unfiltered; + + /** + * Doc block with summaries for current doc block + * @var array + */ + var $summaries; + + /** + * A weighting factor to multiply with each doc SCORE returned from this + * iterator + * @var float + */ + var $weight; + + /** + * Creates a phrase filter iterator with the given parameters. 
+ * + * @param object $index_bundle_iterator to use as a source of documents + * to iterate over + * @param array $restrict_phrases this iterator returns only documents from + * $index_bundle_iterator containing all the elements of restrict + * phrases + * @param array $disallow_phrases this iterator returns only documents from + * $index_bundle_iterator not containing any of the words in disallow + * phrases + * @param float $weight a quantity to multiply each score returned from + * this iterator with + * @param int $limit the first element to return from the list of docs + * iterated over + */ + function __construct($index_bundle_iterator, $restrict_phrases, + $disallow_phrases, $weight = 1, $limit = 0) + { + $this->index_bundle_iterator = $index_bundle_iterator; + $this->limit = $limit; + $this->restrict_phrases = $restrict_phrases; + $this->disallow_phrases = $disallow_phrases; + $this->num_docs = $this->index_bundle_iterator->num_docs; + $this->weight = $weight; + $this->current_block_fresh = false; + $this->reset(); + } + + /** + * Returns the iterators to the first document block that it could iterate + * over + */ + function reset() + { + $this->index_bundle_iterator->reset(); + $this->seen_docs = 0; + $this->seen_docs_unfiltered = 0; + $beneath_limit = true; + while($beneath_limit == true) { + $doc_block = $this->currentDocsWithWord(); + if($doc_block == -1 || !is_array($doc_block)) { + $beneath_limit = false; + continue; + } + if($this->seen_docs + $this->count_block > $this->limit) { + $beneath_limit = false; + continue; + } + $this->advance(); + } + } + + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and score if there are docs left, -1 otherwise + */ + function findDocsWithWord() + { + $pages = $this->index_bundle_iterator->getSummariesFromCurrentDocs(); + $this->count_block_unfiltered = count($pages); + if(!is_array($pages)) { + return $pages; + } + + 
$out_pages = array(); + if(count($pages) > 0 ) { + foreach($pages as $doc_key => $doc_info) { + if(isset($doc_info[self::SUMMARY_OFFSET])) { + /* + if have SUMMARY_OFFSET then should have tried to get + TITLE, etc. + */ + $page_string = + PhraseParser::extractWordStringPageSummary( + $doc_info[self::SUMMARY]); + + $found = true; + + if($this->restrict_phrases != NULL) { + foreach($this->restrict_phrases as $pre_phrase) { + $phrase_parts = explode("*", $pre_phrase); + + $phrase = ""; + $first= ""; + foreach($phrase_parts as $part) {; + $phrase .= $first . preg_quote($part); + $first= '(.)*'; + } + + if(strlen($phrase) > 0 && + mb_eregi($phrase, $page_string) === false) { + $found = false; + } + } + } + if($this->disallow_phrases != NULL && + is_array($this->disallow_phrases)) { + foreach($this->disallow_phrases as $phrase) { + if(strlen($phrase) > 0 && + mb_eregi($phrase, $page_string) !== false) { + $found = false; + } + } + } + if($found == true) { + $doc_info["WEIGHT"] = $this->weight; + $doc_info[self::SCORE] *= $this->weight; + $out_pages[$doc_key] = $doc_info; + } + } + } + $pages = $out_pages; + } + $this->count_block = count($pages); + + if($this->seen_docs < $this->limit) { + $total_docs = $this->seen_docs + $this->count_block; + if($total_docs < $this->limit) { + $pages =array(); + } else { + $pages = array_slice($pages, + $this->limit - $this->seen_docs, NULL, true); + } + } + $this->summaries = $pages; + $this->pages = array(); + foreach($pages as $doc_key => $doc_info) { + $this->pages[$doc_key] = $doc_info; + unset($this->pages[$doc_key][self::SUMMARY]); + } + return $pages; + + } + + /** + * Gets the summaries associated with the keys provided the keys + * can be found in the current block of docs returned by this iterator + * @param array $keys keys to try to find in the current block of returned + * results + * @return array doc summaries that match provided keys + */ + function getSummariesFromCurrentDocs($keys = NULL) + { + 
if($this->current_block_fresh == false) {
+            $result = $this->currentDocsWithWord();
+            if(!is_array($result)) {
+                return $result;
+            }
+        }
+        if(!is_array($this->pages)) {
+            return $this->pages;
+        }
+        if($keys == NULL) {
+            $keys = array_keys($this->pages);
+        }
+        $out_pages = array();
+        foreach($keys as $doc_key) {
+            if(!isset($this->summaries[$doc_key])) {
+                continue;
+            } else {
+                $out_pages[$doc_key] = $this->summaries[$doc_key];
+            }
+        }
+        return $out_pages;
+    }
+
+
+    /**
+     * Forwards the iterator one group of docs
+     */
+    function advance()
+    {
+        $this->advanceSeenDocs();
+
+
+        $this->seen_docs_unfiltered += $this->count_block_unfiltered;
+
+
+
+        if($this->seen_docs_unfiltered > 0) {
+            $this->num_docs =
+                floor(($this->seen_docs*$this->index_bundle_iterator->num_docs)/
+                $this->seen_docs_unfiltered);
+        } else {
+            $this->num_docs = 0;
+        }
+
+        $this->index_bundle_iterator->advance();
+    }
+
+    /**
+     * Returns the index associated with this iterator
+     * @return object the index
+     */
+    function getIndex($key = NULL)
+    {
+        return $this->index_bundle_iterator->getIndex($key);
+    }
+}
+?>
diff --git a/lib/index_bundle_iterators/union_iterator.php b/lib/index_bundle_iterators/union_iterator.php
new file mode 100644
index 000000000..4c8ae8101
--- /dev/null
+++ b/lib/index_bundle_iterators/union_iterator.php
@@ -0,0 +1,260 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage library + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads common constants for word indexing + */ +require_once BASE_DIR.'/lib/indexing_constants.php'; + +/** + *Loads BloomFilterFile to remember things we've already grouped + */ +require_once BASE_DIR.'/lib/bloom_filter_file.php'; + + +/** + *Loads base class for iterating + */ +require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php'; + +/** + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage library + * @see IndexArchiveBundle + */ +class UnionIterator extends IndexBundleIterator +{ + /** + * An array of iterators whose interection we get documents from + * @var array + */ + var $index_bundle_iterators; + /** + * Number of elements in $this->index_bundle_iterators + * @var int + */ + var $num_iterators; + + /** + * The number of documents in the current block before filtering + * by restricted words + * @var int + */ + var $count_block_unfiltered; + /** + * The number of documents in the current block after filtering + * by restricted words + * @var int + */ + var $count_block; + + /** + * The number of iterated docs before the restriction test + * @var int + */ + var $seen_docs_unfiltered; + + + /** + * Creates a union iterator with the given parameters. 
+ * + * @param object $index_bundle_iterator to use as a source of documents + * to iterate over + * @param int $limit the first element to return from the list of docs + * iterated over + */ + function __construct($index_bundle_iterators, $limit = 0) + { + $this->index_bundle_iterators = $index_bundle_iterators; + $this->limit = $limit; + /* + estimate number of results by sum of all iterator counts, + then improve estimate as iterate + */ + $this->num_iterators = count($index_bundle_iterators); + $this->num_docs = 0; + for($i = 0; $i < $this->num_iterators; $i++) { + $this->num_docs += $this->index_bundle_iterators[$i]->num_docs; + } + $this->reset(); + } + + /** + * Returns the iterators to the first document block that it could iterate + * over + */ + function reset() + { + foreach($this->index_bundle_iterators as $iterator) { + $iterator->reset(); + } + + $this->seen_docs = 0; + $this->seen_docs_unfiltered = 0; + $beneath_limit = true; + while($beneath_limit == true) { + $doc_block = $this->currentDocsWithWord(); + if($doc_block == -1 || !is_array($doc_block)) { + $beneath_limit = false; + continue; + } + if($this->seen_docs + $this->count_block >= $this->limit) { + $beneath_limit = false; + continue; + } + $this->advance(); + } + } + + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and score if there are docs left, -1 otherwise + */ + function findDocsWithWord() + { + $pages = array(); + $docs = array(); + $high_score = array(); + $high_score = array(); + $found_docs = false; + for($i = 0; $i < $this->num_iterators; $i++) { + $docs = $this->index_bundle_iterators[$i]->currentDocsWithWord(); + if(is_array($docs)) { + $doc_keys = array_keys($docs); + foreach($doc_keys as $key) { + $docs[$key]["ITERATOR"] = $i; + } + $pages = array_merge($pages, $docs); + $found_docs = true; + } + + } + if($found_docs == false) { + $this->pages = $docs; + return $docs; + } + 
$this->count_block_unfiltered = count($pages); + $this->pages = $pages; + $this->count_block = count($pages); + return $pages; + } + + /** + * Gets the summaries associated with the keys provided the keys + * can be found in the current block of docs returned by this iterator + * @param array $keys keys to try to find in the current block of returned + * results + * @return array doc summaries that match provided keys + */ + function getSummariesFromCurrentDocs($keys = NULL) + { + if($this->current_block_fresh == false) { + $result = $this->currentDocsWithWord(); + if(!is_array($result)) { + return $result; + } + } + if(!is_array($this->pages)) { + return $this->pages; + } + if($keys == NULL) { + $keys = array_keys($this->pages); + } + $out_pages = array(); + echo "hello".$this->pages[$key[0]]["ITERATOR"]."<br/>"; + foreach($keys as $doc_key) { + if(!isset($this->pages[$doc_key]["ITERATOR"])) { + continue; + } else { + $out_pages[$doc_key] = $this->index_bundle_iterators[ + $this->pages[ + $doc_key]["ITERATOR"]]->getSummariesFromCurrentDocs( + array($doc_key)); + } + } + return $out_pages; + } + + /** + * Forwards the iterator one group of docs + */ + function advance() + { + $this->advanceSeenDocs(); + + $this->seen_docs_unfiltered += $this->count_block_unfiltered; + + $total_num_docs = 0; + for($i = 0; $i < $this->num_iterators; $i++) { + $total_num_docs += $this->index_bundle_iterators[$i]->num_docs; + $this->index_bundle_iterators[$i]->advance(); + } + if($this->seen_docs_unfiltered > 0) { + $this->num_docs = + floor(($this->seen_docs * $total_num_docs) / + $this->seen_docs_unfiltered); + } else { + $this->num_docs = 0; + } + } + + /** + * Returns the index associated with this iterator + * @return object the index + */ + function getIndex($key = NULL) + { + if($key != NULL) { + if($this->current_block_fresh == false) { + $result = $this->currentDocsWithWord(); + if(!is_array($result)) { + return $this->index_bundle_iterators[0]->getIndex($key); + } + } + 
if(!isset($this->pages[$key]["ITERATOR"])) { + return $this->index_bundle_iterators[0]->getIndex($key); + } + return $this->index_bundle_iterators[ + $this->pages[$key]["ITERATOR"]]->getIndex($key); + } else { + return $this->index_bundle_iterators[0]->getIndex($key); + } + } +} +?> diff --git a/lib/index_bundle_iterators/word_iterator.php b/lib/index_bundle_iterators/word_iterator.php new file mode 100644 index 000000000..7512d2a4a --- /dev/null +++ b/lib/index_bundle_iterators/word_iterator.php @@ -0,0 +1,399 @@ +<?php +/** + * SeekQuarry/Yioop -- + * Open Source Pure PHP Search Engine, Crawler, and Indexer + * + * Copyright (C) 2009, 2010 Chris Pollett chris@pollett.org + * + * LICENSE: + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * END LICENSE + * + * @author Chris Pollett chris@pollett.org + * @package seek_quarry + * @subpackage library + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009, 2010 + * @filesource + */ + +if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} + +/** + *Loads common constants for word indexing + */ +require_once BASE_DIR.'/lib/indexing_constants.php'; + +/** + *Loads base class for iterating + */ +require_once BASE_DIR.'/lib/index_bundle_iterators/index_bundle_iterator.php'; + +/** + * Used to iterate through the documents associated with a word in + * an IndexArchiveBundle. It also makes it easy to get the summaries + * of these documents. + * + * A description of how words and the documents containing them are stored + * is given in the documentation of IndexArchiveBundle. To iterate over + * all documents containng a word, its hash, work_key, is formed. Then using + * the Bloom filter for that partition, it is determined if the word is stored + * at all, and if it is, which generations it occurs in. Then the iterator + * is set to point to the first block of the first generation the word appears + * in that is greater than the limit of the WordIterator. Thereafter, + * nextDocsWithWord will advance $this->current_pointer by one per call. + * $this->current_pointer keeps track of which block of documents containing + * the word to return. If it is less than COMMON_WORD_THRESHOLD/BLOCK_SIZE and + * there are still more blocks, then the corresponding block_pointer of the word + * from the generation's partition info_block is used to look up the offset to + * the doc block. If it is greater than this value then the linked list + * of doc blocks pointed to for the partition is followed to get the appropriate + * block. This list is in the order that words were stored in the index so + * LIST_OFFSET points to the last block stored, which in turn points to the + * next to last block, etc. 
Finally, when all the blocks in the linked-list are + * exhausted, the remaining docs for that generation for that word are stored + * in the info block for the word itself (this will always be less than + * BLOCK_SIZE many). Once all the docs for a word for a generation have been + * iterated through, than iteration proceeds to the next generation containing + * the word. + * + * @author Chris Pollett + * @package seek_quarry + * @subpackage library + * @see IndexArchiveBundle + */ +class WordIterator extends IndexBundleIterator +{ + /** + * hash of word that the iterator iterates over + * @var string + */ + var $word_key; + /** + * The IndexArchiveBundle this index is associated with + * @var object + */ + var $index; + + /** + * If iterating through the linked-list portions of the documents + * the next byte offset in the WebArchive based linked-list + * @var int + */ + var $next_offset; + /** + * Block number of the last block of docs + * @var int + */ + var $last_pointed_block; + /** + * @var int + */ + var $list_offset; + + /** + * Pointers to offsets for blocks containing docs with the given word + * for the current generation + * @var array + */ + var $block_pointers; + /** + * Number of completely full blocks of documents for the current generation + * @var int + */ + var $num_full_blocks; + /** + * Number of generations word appears in + * @var int + */ + var $num_generations; + /** + * Used to store the contents of the last partially full block + * @var int + */ + var $last_block; + /** + * the info block of the WebArchive that the word lives in + * @var object + */ + var $info_block; + /** + * Stores the number of the current block of documents we are at in the + * set of all blocks of BLOCK_SIZE many documents + * @var int + */ + var $current_pointer; + + /** + * Creates a word iterator with the given parameters. 
+ * + * @param string $word_key hash of word or phrase to iterate docs of + * @param object $index the IndexArchiveBundle to use + * @param int $limit the first element to return from the list of docs + * iterated over + * @param object $info_block the info block of the WebArchive + * associated with the word in the index. If NULL, then this will + * loaded in WordIterator::reset() + */ + function __construct($word_key, $index, $limit = 0, $info_block = NULL) + { + $this->word_key = $word_key; + $this->index = $index; + $this->limit = $limit; + $this->info_block = $info_block; + $this->current_block_fresh = false; + $this->reset(); + } + + /** + * Returns the iterators to the first document block that it could iterate + * over + * + */ + function reset() + { + $this->count_block = 0; + $this->seen_docs = 0; + + $partition = + WebArchiveBundle::selectPartition($this->word_key, + $this->index->num_partitions_index); + if($this->info_block == NULL) { + $this->info_block = + $this->index->getPhraseIndexInfo($this->word_key); + } + if($this->info_block !== NULL) { + $this->num_generations = count($this->info_block['GENERATIONS']); + $count_till_generation = $this->info_block[self::COUNT]; + + while($this->limit >= $count_till_generation) { + $this->info_block['CURRENT_GENERATION_INDEX']++; + if($this->num_generations <= + $this->info_block['CURRENT_GENERATION_INDEX']) { + $this->num_docs = 0; + $this->current_pointer = -1; + return; + } + $info_block = $this->index->getPhraseIndexInfo( + $this->word_key, + $this->info_block['CURRENT_GENERATION_INDEX'], + $this->info_block); + if($info_block !== NULL) { + $this->info_block = $info_block; + } + $count_till_generation += $this->info_block[self::COUNT]; + } + $this->seen_docs = $count_till_generation - + $this->info_block[self::COUNT]; + + } + + + $this->initGeneration(); + + + } + + /** + * Sets up the iterator to iterate through the current generation. 
+ * + * @return bool whether the initialization succeeds + */ + function initGeneration() + { + + if($this->info_block !== NULL) { + $info_block = $this->index->getPhraseIndexInfo( + $this->word_key, $this->info_block['CURRENT_GENERATION_INDEX'], + $this->info_block); + if($info_block === NULL) { + return false; + } + $this->info_block = & $info_block; + $this->num_docs = $info_block['TOTAL_COUNT']; + $this->num_docs_generation = $info_block[self::COUNT]; + + $this->current_pointer = + max(floor(($this->limit - $this->seen_docs) / BLOCK_SIZE), 0); + $this->seen_docs += $this->current_pointer*BLOCK_SIZE; + $this->last_block = $info_block[self::END_BLOCK]; + $this->num_full_blocks = + floor($this->num_docs_generation / BLOCK_SIZE); + if($this->num_docs_generation > COMMON_WORD_THRESHOLD) { + $this->last_pointed_block = + floor(COMMON_WORD_THRESHOLD / BLOCK_SIZE); + } else { + $this->last_pointed_block = $this->num_full_blocks; + } + + for($i = 0; $i < $this->last_pointed_block; $i++) { + if(isset($info_block[$i])) { + $this->block_pointers[$i] = $info_block[$i]; + } + } + + if($this->num_docs_generation > COMMON_WORD_THRESHOLD) { + if($info_block[self::LIST_OFFSET] === NULL) { + $this->list_offset = NULL; + } else { + $this->list_offset = $info_block[self::LIST_OFFSET]; + } + } + + } else { + $this->num_docs = 0; + $this->num_docs_generation = 0; + $this->current_pointer = -1; + } + return true; + } + + /** + * Hook function used by currentDocsWithWord to return the current block + * of docs if it is not cached + * + * @return mixed doc ids and score if there are docs left, -1 otherwise + */ + function findDocsWithWord() + { + if($this->num_generations <= + $this->info_block['CURRENT_GENERATION_INDEX']) { + $this->pages = NULL; + return -1; + } + $generation = + $this->info_block['GENERATIONS'][ + $this->info_block['CURRENT_GENERATION_INDEX']]; + if($this->current_pointer >= 0) { + if($this->current_pointer == $this->num_full_blocks) { + $pages = $this->last_block; + 
} else if ($this->current_pointer >= $this->last_pointed_block) { + /* if there are more than COMMON_WORD_THRESHOLD many + results and we're not at the last block yet + */ + if($this->list_offset === NULL) { + $this->pages = NULL; + return -1; + } + $offset = $this->list_offset; + $found = false; + do { + /* the link list is actually backwards to the order we want + For now, we cycle along the list from the last data + stored until we find the block we want. This is slow + but we are relying on the fact that each generation is + not too big. + */ + $doc_block = $this->index->getWordDocBlock($this->word_key, + $offset, $generation); + $word_keys = array_keys($doc_block); + $found_key = NULL; + foreach($word_keys as $word_key) { + if(strstr($word_key, $this->word_key.":")) { + $found_key = $word_key; + if(isset($doc_block[ + $found_key][self::LIST_OFFSET])) { + //only one list offset/docblock + break; + } + } + } + if($found_key === NULL) { + break; + } + if(isset($doc_block[ + $this->word_key.":".$this->current_pointer])) { + $found = true; + break; + } + $offset = $doc_block[$found_key][self::LIST_OFFSET]; + } while($offset != NULL); + if($found != true) { + $pages = array(); + } else { + $pages = & $doc_block[ + $this->word_key.":".$this->current_pointer]; + } + } else { + //first COMMON_WORD_THRESHOLD many results fast + if(isset($this->block_pointers[$this->current_pointer])) { + $doc_block = $this->index->getWordDocBlock($this->word_key, + $this->block_pointers[$this->current_pointer], + $generation); + if(isset( + $doc_block[$this->word_key.":".$this->current_pointer] + )) { + $pages = & + $doc_block[ + $this->word_key.":".$this->current_pointer]; + } else { + $pages = array(); + } + } else { + $pages = array(); + } + } + + if($this->seen_docs < $this->limit) { + $diff_offset = $this->limit - $this->seen_docs; + + $pages = array_slice($pages, $diff_offset); + } + $this->pages = & $pages; + $this->count_block = count($pages); + return $pages; + } else { + 
$this->pages = NULL; + return -1; + } + } + + + /** + * Forwards the iterator one group of docs + */ + function advance() + { + if($this->current_pointer < 0) {return;} + + $this->advanceSeenDocs(); + + $this->current_pointer ++; + if($this->current_pointer > $this->num_full_blocks) { + $flag = false; + while ($this->info_block['CURRENT_GENERATION_INDEX'] < + $this->num_generations - 1 && !$flag) { + $this->info_block['CURRENT_GENERATION_INDEX']++; + $flag = $this->initGeneration(); + } + if ($this->info_block['CURRENT_GENERATION_INDEX'] >= + $this->num_generations - 1) { + $this->current_pointer = - 1; + } + } + } + + /** + * Returns the index associated with this iterator + * @return object the index + */ + function getIndex($key = NULL) + { + return $this->index; + } +} +?> diff --git a/lib/persistent_structure.php b/lib/persistent_structure.php index 3f6f7c3b6..40594e750 100755 --- a/lib/persistent_structure.php +++ b/lib/persistent_structure.php @@ -60,7 +60,7 @@ class PersistentStructure * @var int */ var $unsaved_operations; - /** Number of operation between saves + /** Number of operation between saves. 
If == -1 never save * @var int */ var $save_frequency; @@ -71,7 +71,8 @@ class PersistentStructure * * @param string $fname the name of the file to store the * PersistentStructure in - * @param int $save_frequency the number of operation before a save + * @param int $save_frequency the number of operation before a save If + * <= 0 never save */ public function __construct($fname, $save_frequency = self::DEFAULT_SAVE_FREQUENCY) @@ -107,7 +108,8 @@ class PersistentStructure function checkSave() { $this->unsaved_operations++; - if($this->unsaved_operations >= $this->save_frequency) { + if($this->save_frequency > 0 && + $this->unsaved_operations >= $this->save_frequency) { $this->save(); $this->unsaved_operations = 0; } diff --git a/lib/phrase_parser.php b/lib/phrase_parser.php index c22b1b664..967859823 100755 --- a/lib/phrase_parser.php +++ b/lib/phrase_parser.php @@ -67,17 +67,8 @@ class PhraseParser $page[CrawlConstants::TITLE]); $description_phrase_string = mb_ereg_replace("[[:punct:]]", " ", $page[CrawlConstants::DESCRIPTION]); - $link_phrase_string = ""; - $link_urls = array(); - foreach($page[CrawlConstants::LINKS] as $url => $link_text) { - $link_phrase_string .= " $link_text"; - } - - $link_phrase_string = mb_ereg_replace("[[:punct:]]", " ", - $link_phrase_string); - $page_string = $title_phrase_string . " " . $description_phrase_string . - " " . $link_phrase_string; + $page_string = $title_phrase_string . " " . 
$description_phrase_string; $page_string = preg_replace("/(\s)+/", " ", $page_string); return $page_string; diff --git a/lib/processors/html_processor.php b/lib/processors/html_processor.php index bc27150f3..1b18d3448 100755 --- a/lib/processors/html_processor.php +++ b/lib/processors/html_processor.php @@ -52,7 +52,7 @@ require_once BASE_DIR."/lib/url_parser.php"; */ class HtmlProcessor extends TextProcessor { - const MAX_DESCRIPTION_LEN = 3000; + const MAX_DESCRIPTION_LEN = 2000; /** diff --git a/lib/processors/image_processor.php b/lib/processors/image_processor.php index 009e6917b..fbe9ec4be 100755 --- a/lib/processors/image_processor.php +++ b/lib/processors/image_processor.php @@ -73,7 +73,7 @@ abstract class ImageProcessor implements CrawlConstants static function createThumb($image) { $thumb = imagecreatetruecolor(50, 50); - if( isset($image) && $image == false ) { + if( isset($image) && $image !== false ) { $size_x = imagesx($image); $size_y = imagesy($image); diff --git a/lib/url_parser.php b/lib/url_parser.php index be5623dfd..dcf039ac5 100755 --- a/lib/url_parser.php +++ b/lib/url_parser.php @@ -114,7 +114,7 @@ class UrlParser * @param string $url the url to parse * @return the host portion of the url if present; NULL otherwise */ - public static function getPath($url) + static function getPath($url) { $url_parts = @parse_url($url); if(!isset($url_parts['path'])) { @@ -134,7 +134,7 @@ class UrlParser * @param string $url the url to extract prefixes from * @return array the array of url prefixes */ - public static function getHostPaths($url) + static function getHostPaths($url) { $host_paths = array($url); @@ -162,6 +162,34 @@ class UrlParser } + /** + * Gets the subdomains of the host portion of a url. 
So + * + * http://a.b.c/d/f/ + * will return a.b.c, .a.b.c, b.c, .b.c, c, .c + * + * @param string $url the url to extract prefixes from + * @return array the array of url prefixes + */ + static function getHostSubdomains($url) + { + $subdomains = array(); + $url_parts = @parse_url($url); + if(strlen($url_parts['host']) <= 0) { return $subdomains; } + $host = $url_parts['host']; + $host_parts = explode(".", $host); + $num_parts = count($host_parts); + $domain = ""; + for($i = $num_parts - 1; $i >= 0 ; $i--) { + $domain = $host_parts[$i].$domain; + $subdomains[] = $domain; + $domain = ".$domain"; + $subdomains[] = $domain; + } + + return $subdomains; + } + /** * Given a url, makes a guess at the file type of the file it points to * @@ -312,7 +340,13 @@ class UrlParser $path2 = str_replace("//","/", $path); } while($path != $path2); - $path = str_replace("/./","/", $path); + $path = str_replace("/./","/", $path); + if($path == "." || substr($path, -2) == "/.") { + $path = "/"; + } + if($path == "") { + $path = "/"; + } $url = $host.$path; diff --git a/lib/utility.php b/lib/utility.php index 55294e4a4..376f3cc08 100755 --- a/lib/utility.php +++ b/lib/utility.php @@ -215,6 +215,22 @@ function scoreOrderCallback($word_doc_a, $word_doc_b) (float)$word_doc_b[CrawlConstants::SCORE]) ? -1 : 1; } +/** + * Callback function used to sort documents by doc_rank + * + * The function is used to sort documents being added to an IndexArchiveBundle + * + * @param string $word_doc_a doc id of first document to compare + * @param string $word_doc_b doc id of second document to compare + * @return int -1 if first doc bigger 1 otherwise + * @see IndexArchiveBundle::addPartitionWordData() + */ +function docRankOrderCallback($word_doc_a, $word_doc_b) +{ + return ((float)$word_doc_a[CrawlConstants::DOC_RANK] > + (float)$word_doc_b[CrawlConstants::DOC_RANK]) ? 
-1 : 1; +} + /** * Callback to check if $a is less than $b * diff --git a/lib/web_archive_bundle.php b/lib/web_archive_bundle.php index 4f56f6b77..7909e0d29 100755 --- a/lib/web_archive_bundle.php +++ b/lib/web_archive_bundle.php @@ -145,7 +145,7 @@ class WebArchiveBundle } //store/read archive description - $info = NULL; + if(file_exists($dir_name."/description.txt")) { $info = unserialize( file_get_contents($this->dir_name."/description.txt")); @@ -170,8 +170,6 @@ class WebArchiveBundle } } - $info = array(); - $info['DESCRIPTION'] = $this->description; $info['NUM_PARTITIONS'] = $this->num_partitions; $info['COUNT'] = $this->count; @@ -418,17 +416,36 @@ class WebArchiveBundle return $this->partition[$index]; } + /** + * Creates a new counter to be maintain in the description.txt + * file if the counter doesn't exist, leaves unchanged otherwise + * + * @param string $field field of info struct to add a counter for + */ + function initCountIfNotExists($field = "COUNT") + { + $info = + unserialize(file_get_contents($this->dir_name."/description.txt")); + if(!isset($info[$field])) { + $info[$field] = 0; + } + file_put_contents($this->dir_name."/description.txt", serialize($info)); + } + /** * Updates the description file with the current count for the number of - * items in the WebArchiveBundle + * items in the WebArchiveBundle. If the $field item is used counts of + * additional properties (visited urls say versus total urls) can be + * maintained. 
* * @param int $num number of items to add to current count + * @param string $field field of info struct to add to the count of */ - function addCount($num) + function addCount($num, $field = "COUNT") { $info = unserialize(file_get_contents($this->dir_name."/description.txt")); - $info['COUNT'] += $num; + $info[$field] += $num; file_put_contents($this->dir_name."/description.txt", serialize($info)); } diff --git a/locale/en-US/configure.ini b/locale/en-US/configure.ini index 265df66b2..4b38bc799 100755 --- a/locale/en-US/configure.ini +++ b/locale/en-US/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "Please Describe Your Robot" ; search_controller.php line: 119 search_controller_logout_successful = "Logout Successful!!" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "This cached version of %s was obtained by the Yioop crawler on %s." ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "Time started:" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "No start time found" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "Total Urls Seen:" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "Visited Urls Count:" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "Total Urls Extracted:" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "Most Recent Fetcher:" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "No Fetcher Queries Yet" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "Most Recent Urls" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "No Recent Urls" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "Previous Crawls" ; -; crawlstatus_view.php 
line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "Description:" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "Time started:" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "Total Urls Seen:" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "Visited/Extracted Urls" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "Actions:" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "Resume" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "Set as Index" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "Search Index" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "Delete" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "No Previous Crawls" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "Rel: %s " ; search_view.php line: 133 search_view_score = "Score %s" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "Cached" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "View as text" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "Similar" ; +; search_view.php line: 157 +search_view_inlink = "Inlinks" +; ; settings_view.php line: 76 settings_view_settings = "Settings" ; diff --git a/locale/en-US/statistics.txt b/locale/en-US/statistics.txt index 5a165df53..b6bef56f0 100755 --- a/locale/en-US/statistics.txt +++ b/locale/en-US/statistics.txt @@ -1 +1 @@ -d:100; \ No newline at end of file +d:99; \ No newline at end of file diff --git a/locale/fr-FR/configure.ini b/locale/fr-FR/configure.ini index 
251692d5c..6d7c9acab 100755 --- a/locale/fr-FR/configure.ini +++ b/locale/fr-FR/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; 
crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "Pertinence: %s" ; search_view.php line: 133 search_view_score = "Total: %s" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "En Cache" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "Version texte" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "Pages similaires" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "Préférences" ; diff --git a/locale/he/configure.ini b/locale/he/configure.ini index 07c05e0f1..555e6e092 100755 --- a/locale/he/configure.ini +++ b/locale/he/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php 
line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "" ; search_view.php line: 133 search_view_score = "" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "הגדרות" ; diff --git a/locale/in-ID/configure.ini b/locale/in-ID/configure.ini index 16a90626b..dd9744670 100755 --- a/locale/in-ID/configure.ini +++ b/locale/in-ID/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "Logout berhasil" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "" ; ; 
settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "" ; search_view.php line: 133 search_view_score = "" ; -; 
search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "" ; diff --git a/locale/ja/configure.ini b/locale/ja/configure.ini index 01eb7f43f..4da074c08 100755 --- a/locale/ja/configure.ini +++ b/locale/ja/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "ロボットの説明してください。" ; search_controller.php line: 119 search_controller_logout_successful = "ログアウト成功" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "%sのこのキャッシュされたバージョンは%sのウィオップから入手しました。" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "始まった時の時間" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "検索始まった時間は見つけない" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "全部URL" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "全部URL" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "最新フェッチャ" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "フェッチャキュエリはまだありません" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "最新URL" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "最近URLはありません" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "さっきの検索" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "説明" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "始まった時の時間" ; -; 
crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "全部URL" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "アクション" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "再会" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "指数の設定する。" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "検索指数" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "削除" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "さっきの検索はありません" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "関連:%s" ; search_view.php line: 133 search_view_score = "スコア %s" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "キャッシューしました。" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "テクストビュー" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "同じビュー" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "設定" ; diff --git a/locale/ja/statistics.txt b/locale/ja/statistics.txt index 5a165df53..eedae9b06 100755 --- a/locale/ja/statistics.txt +++ b/locale/ja/statistics.txt @@ -1 +1 @@ -d:100; \ No newline at end of file +d:98; \ No newline at end of file diff --git a/locale/ko/configure.ini b/locale/ko/configure.ini index e6d3569e9..81951938c 100755 --- a/locale/ko/configure.ini +++ b/locale/ko/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "로그 아웃 성공!!" 
; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "현재 캐시 버젼 %s 은 Yioop 크롤 %s 에 의하여 얻어 졌습니다. " ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "시작한 시간:" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "시작 시간이 존재하지 않습니다." ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "지금까지 본 총 합계 주소(URLs):" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "지금까지 본 총 합계 주소(URLs):" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "설명:" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "시작한 시간:" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "지금까지 본 총 합계 주소(URLs):" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php 
line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "관련성: %s " ; search_view.php line: 133 search_view_score = "점수 %s" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "캐시 됀것" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "일반 텍스트로써 보기" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "유사성" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "세팅" ; diff --git a/locale/ko/statistics.txt b/locale/ko/statistics.txt index 187cb44f0..b26155324 100755 --- a/locale/ko/statistics.txt +++ b/locale/ko/statistics.txt @@ -1 +1 @@ -d:32; \ No newline at end of file +d:31; \ No newline at end of file diff --git a/locale/rn-US/configure.ini b/locale/rn-US/configure.ini index 03f846c9d..63b95bb17 100755 --- a/locale/rn-US/configure.ini +++ b/locale/rn-US/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "Logout Successful!" 
; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; 
/Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "" ; search_view.php line: 133 search_view_score = "" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "" ; diff --git a/locale/th/configure.ini b/locale/th/configure.ini index e255988bf..123f3d571 100755 --- a/locale/th/configure.ini +++ b/locale/th/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 
crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "" ; search_view.php line: 133 search_view_score = "" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "" ; diff --git a/locale/vi-VN/configure.ini b/locale/vi-VN/configure.ini index e70ca3884..39dbad5a2 100755 --- a/locale/vi-VN/configure.ini +++ b/locale/vi-VN/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "Diễn tả rô bô của bạn" ; search_controller.php line: 119 search_controller_logout_successful = "Thoát thành công" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "Trang gốc này: %s đã tìm được bởi công cụ tìm kiẽm Yioop vào ngày %s." 
; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "Thời gian bắt đầu:" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "Không tìm thấy thời gian bắt đầu" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "Mô tả:" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "Thời gian bắt đầu:" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "Những hành động:" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "Bắt đầu trở lại" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "Cài làm mục lục" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "Tìm mục lục" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "Xoá" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; 
/Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "Thích hợp:" ; search_view.php line: 133 search_view_score = "Điểm: %s" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "Trang gốc" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "Trang Web Bắng Chữ" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "Tương Tự" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "Sự sắp đặt" ; diff --git a/locale/vi-VN/statistics.txt b/locale/vi-VN/statistics.txt index 77bbfe053..2c43a0adb 100755 --- a/locale/vi-VN/statistics.txt +++ b/locale/vi-VN/statistics.txt @@ -1 +1 @@ -d:74; \ No newline at end of file +d:73; \ No newline at end of file diff --git a/locale/vn-US/configure.ini b/locale/vn-US/configure.ini index 7908df79c..ad67134e6 100755 --- a/locale/vn-US/configure.ini +++ b/locale/vn-US/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "" ; -; search_controller.php line: 366 +; search_controller.php line: 369 search_controller_cached_version = "" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 
crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ search_view_relevancy = "" ; search_view.php line: 133 search_view_score = "" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "" ; diff --git a/locale/zh-CN/configure.ini b/locale/zh-CN/configure.ini index b358f233b..7b9eb7f87 100755 --- a/locale/zh-CN/configure.ini +++ b/locale/zh-CN/configure.ini @@ -199,7 +199,7 @@ admin_controller_describe_robot = "" ; search_controller.php line: 119 search_controller_logout_successful = "" ; -; search_controller.php line: 366 +; 
search_controller.php line: 369 search_controller_cached_version = "" ; ; settings_controller.php line: 134 @@ -231,49 +231,52 @@ crawlstatus_view_time_started = "" ; crawlstatus_view.php line: 77 crawlstatus_view_no_crawl_time = "" ; -; crawlstatus_view.php line: 79 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 78 +crawlstatus_view_visited_urls = "" ; ; crawlstatus_view.php line: 82 +crawlstatus_view_total_urls = "" +; +; crawlstatus_view.php line: 85 crawlstatus_view_most_recent_fetcher = "" ; -; crawlstatus_view.php line: 88 +; crawlstatus_view.php line: 91 crawlstatus_view_no_fetcher = "" ; -; crawlstatus_view.php line: 91 +; crawlstatus_view.php line: 94 crawlstatus_view_most_recent_urls = "" ; -; crawlstatus_view.php line: 99 +; crawlstatus_view.php line: 102 crawlstatus_view_no_recent_urls = "" ; -; crawlstatus_view.php line: 103 +; crawlstatus_view.php line: 106 crawlstatus_view_previous_crawls = "" ; -; crawlstatus_view.php line: 109 +; crawlstatus_view.php line: 112 crawlstatus_view_description = "" ; -; crawlstatus_view.php line: 110 +; crawlstatus_view.php line: 113 crawlstatus_view_time_started = "" ; -; crawlstatus_view.php line: 111 -crawlstatus_view_total_urls = "" +; crawlstatus_view.php line: 114 +crawlstatus_view_url_counts = "" ; -; crawlstatus_view.php line: 112 +; crawlstatus_view.php line: 115 crawlstatus_view_actions = "" ; -; crawlstatus_view.php line: 121 +; crawlstatus_view.php line: 126 crawlstatus_view_resume = "" ; -; crawlstatus_view.php line: 127 +; crawlstatus_view.php line: 132 crawlstatus_view_set_index = "" ; -; crawlstatus_view.php line: 130 +; crawlstatus_view.php line: 135 crawlstatus_view_search_index = "" ; -; crawlstatus_view.php line: 137 +; crawlstatus_view.php line: 142 crawlstatus_view_delete = "" ; -; crawlstatus_view.php line: 144 +; crawlstatus_view.php line: 149 crawlstatus_view_no_previous_crawl = "" ; ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/elements @@ -582,15 +585,18 @@ 
search_view_relevancy = "" ; search_view.php line: 133 search_view_score = "分數" ; -; search_view.php line: 142 +; search_view.php line: 144 search_view_cache = "" ; -; search_view.php line: 145 +; search_view.php line: 147 search_view_as_text = "" ; -; search_view.php line: 151 +; search_view.php line: 153 search_view_similar = "" ; +; search_view.php line: 157 +search_view_inlink = "" +; ; settings_view.php line: 76 settings_view_settings = "設定" ; diff --git a/locale/zh-CN/statistics.txt b/locale/zh-CN/statistics.txt index b26155324..039ce78b2 100755 --- a/locale/zh-CN/statistics.txt +++ b/locale/zh-CN/statistics.txt @@ -1 +1 @@ -d:31; \ No newline at end of file +d:30; \ No newline at end of file diff --git a/models/crawl_model.php b/models/crawl_model.php index 1cf075d31..03d9d95a2 100755 --- a/models/crawl_model.php +++ b/models/crawl_model.php @@ -191,6 +191,9 @@ class CrawlModel extends Model implements CrawlConstants substr($pre_timestamp, strlen(self::index_data_base_name)); $info = IndexArchiveBundle::getArchiveInfo($dir); $crawl['DESCRIPTION'] = $info['DESCRIPTION']; + $crawl['VISITED_URLS_COUNT'] = + isset($info['VISITED_URLS_COUNT']) ? + $info['VISITED_URLS_COUNT'] : 0; $crawl['COUNT'] = $info['COUNT']; $crawl['NUM_PARTITIONS'] = $info['NUM_PARTITIONS']; $list[] = $crawl; diff --git a/models/phrase_model.php b/models/phrase_model.php index eb0d38795..5cf646d6d 100755 --- a/models/phrase_model.php +++ b/models/phrase_model.php @@ -46,6 +46,14 @@ require_once BASE_DIR."/lib/utility.php"; */ require_once BASE_DIR."/lib/index_archive_bundle.php"; +/** + * Load iterators to get docs out of index archive + */ +foreach(glob(BASE_DIR."/lib/index_bundle_iterators/*_iterator.php") + as $filename) { + require_once $filename; +} + /** * * This is class is used to handle @@ -89,44 +97,106 @@ class PhraseModel extends Model $format = true) { - $index_archive_name = self::index_data_base_name . 
$this->index_name; + $results = NULL; + $word_structs = array(); + /* + this is a quick and dirty parsing and will usually work, + exceptions would be | in quotes or if someone tried + to escape |. + */ + $disjunct_phrases = explode("|", $phrase); + foreach($disjunct_phrases as $disjunct) { + list($word_struct, $format_words) = + $this->parseWordStructConjunctiveQuery($disjunct); + if($word_struct != NULL) { + $word_structs[] = $word_struct; + } + } + + $results = $this->getSummariesByHash($word_structs, + $low, $results_per_page); + if(count($results) == 0) { + $results = NULL; + } + if($results == NULL) { + $results['TOTAL_ROWS'] = 0; + } + + if($format) { + if(count($format_words) == 0 ){ + $format_words = NULL; + } + } else { + $format_words = NULL; + } + + + $output = $this->formatPageResults($results, $format_words); + + return $output; + } + + + function parseWordStructConjunctiveQuery($phrase) + { + $phrase = " ".$phrase; + $phrase_string = $phrase; + $meta_words = array('link\:', 'site\:', + 'filetype\:', 'info\:', '\-', + 'index:', 'i:', 'weight:', 'w:'); + $index_name = $this->index_name; + $weight = 1; + $found_metas = array(); + $disallow_phrases = array(); + foreach($meta_words as $meta_word) { + $pattern = "/(\s)($meta_word(\S)+)/"; + preg_match_all($pattern, $phrase, $matches); + if(in_array($meta_word, array('link\:', 'site\:', + 'filetype\:', 'info\:') )) { + $found_metas = array_merge($found_metas, $matches[2]); + } else if($meta_word == '\-') { + if(count($matches[0]) > 0) { + $disallow_phrases = + array_merge($disallow_phrases, + array(substr($matches[2][0],2))); + } + } else if ($meta_word == "i:" || $meta_word == "index:") { + if(isset($matches[2][0])) { + $index_name = substr($matches[2][0],strlen($meta_word)); + } + } else if ($meta_word == "w:" || $meta_word == "weight:") { + if(isset($matches[2][0])) { + $weight = substr($matches[2][0],strlen($meta_word)); + } + } + $phrase_string = preg_replace($pattern,"", $phrase_string); + } + + 
$index_archive_name = self::index_data_base_name . $index_name; $index_archive = new IndexArchiveBundle( CRAWL_DIR.'/cache/'.$index_archive_name); - $results = NULL; - - $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $phrase); + $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $phrase_string); $phrase_string = preg_replace("/(\s)+/", " ", $phrase_string); + /* we search using the stemmed words, but we format snippets in the results by bolding either */ $query_words = explode(" ", $phrase_string); //not stemmed - $words = + $base_words = array_keys(PhraseParser::extractPhrasesAndCount($phrase_string)); //stemmed + + $words = array_merge($base_words, $found_metas); if(isset($words) && count($words) == 1) { $phrase_string = $words[0]; - } - $phrase_hash = crawlHash($phrase_string); - - $phrase_info = $index_archive->getPhraseIndexInfo($phrase_hash); - if(isset($phrase_info[IndexingConstants::PARTIAL_COUNT]) && - $phrase_info[IndexingConstants::PARTIAL_COUNT] < - $low + $results_per_page) { - $phrase_info = NULL; - } - - if($phrase_info != NULL) { - - $results = $index_archive->getSummariesByHash( - $phrase_hash, $low, $results_per_page, NULL, NULL, $phrase_info); - - if(count($results) == 0) { - $results = NULL; - } - + $phrase_hash = crawlHash($phrase_string); + $word_struct = array("KEYS" => array($phrase_hash), + "RESTRICT_PHRASES" => NULL, "DISALLOW_PHRASES" => NULL, + "WEIGHT" => $weight, "INDEX_ARCHIVE" => $index_archive + ); } else { /* handle strings in quotes @@ -138,13 +208,6 @@ class PhraseModel extends Model preg_match_all('/\"((?:[^\"\\\]|\\\\.)*)\"/', $phrase,$quoteds); if(isset($quoteds[1])) { $quoteds = $quoteds[1]; - foreach($quoteds as $quote_phrase) { - $hash_quote = crawlHash($quote_phrase); - if($index_archive->getPhraseIndexInfo($hash_quote) != NULL){ - $hash_quoteds[] = $hash_quote; - } - } - } //get a raw list of words and their hashes @@ -154,42 +217,42 @@ class PhraseModel extends Model $tmp = crawlHash($word); $hashes[] = $tmp; } - 
$hashes = array_merge($hashes, $hash_quoteds); + $restrict_phrases = array_merge($query_words, $quoteds); - - + $hashes = array_unique($hashes); $restrict_phrases = array_unique($restrict_phrases); - - $words_array = $index_archive->getSelectiveWords($hashes, 1); - $word_keys = array_keys($words_array); - $word_key = $word_keys[0]; - $count = $words_array[$word_key]; - if($count > 0 ) { - $results = $index_archive->getSummariesByHash( - $word_key, $low, $results_per_page, - $restrict_phrases, $phrase_hash); + $restrict_phrases = array_filter($restrict_phrases); + $words_array = $index_archive->getSelectiveWords($hashes, 10); + + if(is_array($words_array)) { + reset($words_array); + $word_key = key($words_array); + $word_count = $words_array[$word_key]; + foreach($words_array as $key => $count) { + if($count > 3 * $word_count) { + unset($words_array[$key]); + } + } + $word_keys = array_keys($words_array); + $word_struct = array("KEYS" => $word_keys, + "RESTRICT_PHRASES" => $restrict_phrases, + "DISALLOW_PHRASES" => $disallow_phrases, + "WEIGHT" => $weight, + "INDEX_ARCHIVE" => $index_archive + ); + if($word_count <= 0 ) { + $word_struct = NULL; + } + } else { + $word_struct = NULL; } } + $format_words = array_merge($query_words, $base_words); - if($results == NULL) { - $results['TOTAL_ROWS'] = 0; - } - - if($format) { - $formatted_words = array_merge($query_words, $words); - } else { - $formatted_words = NULL; - } - - - $output = $this->formatPageResults($results, $formatted_words); - - return $output; - + return array($word_struct, $format_words); } - /** * Given a page summary extract the words from it and try to find documents * which match the most relevant words. The algorithm for "relevant" is @@ -234,6 +297,93 @@ class PhraseModel extends Model } + /** + * Gets doc summaries of documents containing given words and meeting the + * additional provided criteria + * @param array $word_structs an array of word_structs. 
Here a word_struct + * is an associative array with at least the following fields + * KEYS -- an array of word keys + * RESTRICT_PHRASES -- an array of phrases the document must contain + * DISALLOW_PHRASES -- an array of words the document must not contain + * WEIGHT -- a weight to multiple scores returned from this iterator by + * INDEX_ARCHIVE -- an index_archive object to get results from + * @param int $limit number of first document in order to return + * @param int $num number of documents to return summaries of + * @param object $index_archive index archive to use to get summaries from + * @return array document summaries + */ + function getSummariesByHash($word_structs, $limit, $num) + { + + $iterators = array(); + foreach($word_structs as $word_struct) { + if(!is_array($word_struct)) { continue;} + $word_keys = $word_struct["KEYS"]; + $restrict_phrases = $word_struct["RESTRICT_PHRASES"]; + $disallow_phrases = $word_struct["DISALLOW_PHRASES"]; + $index_archive = $word_struct["INDEX_ARCHIVE"]; + $weight = $word_struct["WEIGHT"]; + $num_word_keys = count($word_keys); + if($num_word_keys < 1) {continue;} + + for($i = 0; $i < $num_word_keys; $i++) { + $word_iterators[$i] = + new WordIterator($word_keys[$i], $index_archive, 0); + } + if($num_word_keys == 1) { + $base_iterator = $word_iterators[0]; + } else { + $base_iterator = new IntersectIterator($word_iterators, 0); + } + if($restrict_phrases == NULL && $disallow_phrases == NULL && + $weight == 1) { + $iterators[] = $base_iterator; + } else { + $iterators[] = new PhraseFilterIterator($base_iterator, + $restrict_phrases, $disallow_phrases, $weight, 0); + } + + } + $num_iterators = count($iterators); + if( $num_iterators < 1) { + return NULL; + } else if($num_iterators == 1) { + $union_iterator = $iterators[0]; + } else { + $union_iterator = new UnionIterator($iterators, 0); + } + + $to_retrieve = $limit + max(2*$num, 200); + $group_iterator = new GroupIterator($union_iterator, 0); + $num_retrieved = 0; + 
$pages = array(); + while(is_array($next_docs = $group_iterator->nextDocsWithWord()) && + $num_retrieved < $to_retrieve) { + foreach($next_docs as $doc_key => $doc_info) { + $summary = & $doc_info[CrawlConstants::SUMMARY]; + unset($doc_info[CrawlConstants::SUMMARY]); + $pages[] = array_merge($doc_info, $summary); + $num_retrieved++; + if($num_retrieved >= $to_retrieve) { + + break 2; + } + } + } + uasort($pages, "scoreOrderCallback"); + $pages = array_slice($pages, $limit, $num); + if($num_retrieved < $to_retrieve && $limit<=$group_iterator->num_docs) { + $results['TOTAL_ROWS'] = $num_retrieved; + } else { + $results['TOTAL_ROWS'] = max($group_iterator->num_docs, + $num_retrieved); + /*num_docs is only approximate, so if gives contradictory info + use $num_retrieved */ + } + $results['PAGES'] = $pages; + return $results; + } + } ?> diff --git a/views/crawlstatus_view.php b/views/crawlstatus_view.php index b7a92e888..452c5e8f7 100755 --- a/views/crawlstatus_view.php +++ b/views/crawlstatus_view.php @@ -76,7 +76,10 @@ class CrawlstatusView extends View <?php if(isset($data['CRAWL_TIME'])) { e(date("r",$data['CRAWL_TIME'])); } else {e(tl('crawlstatus_view_no_crawl_time'));} ?></p> - + <p><b><?php e(tl('crawlstatus_view_visited_urls')); ?></b> <?php + if(isset($data['VISITED_URLS_COUNT'])) { + e($data['VISITED_URLS_COUNT']); } else {e("0");} + ?></p> <p><b><?php e(tl('crawlstatus_view_total_urls')); ?></b> <?php if(isset($data['COUNT'])) { e($data['COUNT']); } else {e("0");} ?></p> @@ -109,14 +112,16 @@ class CrawlstatusView extends View <table class="crawlstable"> <tr><th><?php e(tl('crawlstatus_view_description'));?></th><th><?php e(tl('crawlstatus_view_time_started')); ?></th> - <th><?php e(tl('crawlstatus_view_total_urls'));?></th> + <th><?php e(tl('crawlstatus_view_url_counts'));?></th> <th colspan="3"><?php e(tl('crawlstatus_view_actions'));?></th></tr> <?php foreach($data['RECENT_CRAWLS'] as $crawl) { ?> <tr><td><b><?php e($crawl['DESCRIPTION']); ?></b></td><td> 
<?php e(date("r", $crawl['CRAWL_TIME'])); ?></td> - <td> <?php e( $crawl['COUNT']); ?></td> + <td> <?php e( (isset($crawl["VISITED_URLS_COUNT"]) ? + $crawl['VISITED_URLS_COUNT'] : 0) ."/". + $crawl['COUNT']); ?></td> <td><a href="<?php e($base_url); ?>resume×tamp=<?php e($crawl['CRAWL_TIME']); ?>"><?php e(tl('crawlstatus_view_resume'));?></a></td> diff --git a/views/search_view.php b/views/search_view.php index 885eef536..9be7cc805 100755 --- a/views/search_view.php +++ b/views/search_view.php @@ -114,7 +114,10 @@ class SearchView extends View implements CrawlConstants foreach($data['PAGES'] as $page) {?> <div class='result'> <h2> - <a href="<?php e($page[self::URL]); ?>" ><?php + <a href="<?php if($page[self::TYPE] != "link") { + e($page[self::URL]); + } else + e(strip_tags($page[self::TITLE])); ?>" ><?php if(isset($page[self::THUMB]) && $page[self::THUMB] != 'NULL') { ?><img src="<?php e($page[self::THUMB]); ?>" alt="<?php e($page[self::TITLE]); ?>" /> <?php @@ -125,31 +128,44 @@ class SearchView extends View implements CrawlConstants ?></a></h2> <p><?php echo $page[self::DESCRIPTION]; ?></p> - <p class="echolink" ><?php e($page[self::URL]." "); + <p class="echolink" ><?php + e(substr($page[self::URL],0, 200)." "); e(tl('search_view_rank', - number_format($page[self::DOC_RANK], 2))); + number_format($page[self::DOC_RANK], 2))); + $page["WEIGHT"] = (isset($page["WEIGHT"])) ? 
+ $page["WEIGHT"] : 1; e(tl('search_view_relevancy', - number_format(1.25*floatval($page[self::SCORE]) - - floatval($page[self::DOC_RANK]), 2) )); - e(tl('search_view_score', 1.25* $page[self::SCORE]));?> - <a href="?c=search&a=cache&q=<?php - e($data['QUERY']); ?>&arg=<?php - e(urlencode($page[self::URL])); - ?>&so=<?php e($page[self::SUMMARY_OFFSET]); - ?>&its=<?php e($data['its']); ?>" > - <?php - if($page[self::TYPE] == "text/html" || - stristr($page[self::TYPE], "image")) { - e(tl('search_view_cache')); + number_format((1.25*floatval($page[self::SCORE]) + - floatval($page[self::DOC_RANK])) + / $page["WEIGHT"] , 2) )); + e(tl('search_view_score', 1.25* $page[self::SCORE])); + if($page[self::TYPE] != "link") { + ?> + <a href="?c=search&a=cache&q=<?php + e($data['QUERY']); ?>&arg=<?php + e(urlencode($page[self::URL])); + ?>&so=<?php e($page[self::SUMMARY_OFFSET]); + ?>&its=<?php e($data['its']); ?>" > + <?php + if($page[self::TYPE] == "text/html" || + stristr($page[self::TYPE], "image")) { + e(tl('search_view_cache')); - } else { - e(tl('search_view_as_text')); - } - ?></a>. <a href="?c=search&a=related&arg=<?php - e(urlencode($page[self::URL])); ?>&so=<?php - e($page[self::SUMMARY_OFFSET]); - ?>&its=<?php e($data['its']); ?>" ><?php - e(tl('search_view_similar')); ?></a>.</p> + } else { + e(tl('search_view_as_text')); + } + ?></a>. <a href="?c=search&a=related&arg=<?php + e(urlencode($page[self::URL])); ?>&so=<?php + e($page[self::SUMMARY_OFFSET]); + ?>&its=<?php e($data['its']); ?>" ><?php + e(tl('search_view_similar')); + ?></a>. <a href="?c=search&q=<?php + e("link:".urlencode($page[self::URL])); ?>& + its=<?php e($data['its']); ?>" ><?php + e(tl('search_view_inlink')); + ?></a>.</p> + <?php + } ?> </div> <?php