Add raw attribute to query string, a=chris

Chris Pollett [2011-10-18]
Filename
bin/fetcher.php
controllers/search_controller.php
lib/fetch_url.php
lib/index_bundle_iterators/group_iterator.php
locale/fr-FR/configure.ini
locale/ja/configure.ini
locale/vi-VN/configure.ini
models/phrase_model.php
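
In outline, this commit threads a new raw attribute from the search query string (and the search API) down through PhraseModel to GroupIterator, so callers can ask for ungrouped results. A hedged sketch of how the attribute might be passed, using the default Yioop front controller URL (the query text below is illustrative, not from the commit):

    index.php?q=chris                normal grouped results (raw unset)
    index.php?q=chris&raw=1          results are not grouped, but a parent
                                     doc is looked up for bare link entries

Since the controller only tests whether $_REQUEST['raw'] is truthy, mode 2 (no grouping and no look-up) appears to be reachable only through the API methods queryRequest() and relatedRequest(), or by calling PhraseModel directly, rather than via the query string.
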
diff --git a/bin/fetcher.php b/bin/fetcher.php
index 6c1339626..99d0bd0da 100755
--- a/bin/fetcher.php
+++ b/bin/fetcher.php
@@ -703,7 +703,7 @@ class Fetcher implements CrawlConstants
     }

     /**
-     *
+     * @param array &$info associative array of crawl parameters (received
+     *      from the queue server) used to configure this fetcher
      */
     function setCrawlParamsFromArray(&$info)
     {
diff --git a/controllers/search_controller.php b/controllers/search_controller.php
index 34046c89c..36f3b5bb9 100755
--- a/controllers/search_controller.php
+++ b/controllers/search_controller.php
@@ -90,13 +90,19 @@ class SearchController extends Controller implements CrawlConstants
     {
         $data = array();
         $view = "search";
+        $start_time = microtime();
+
         if(isset($_REQUEST['f']) && $_REQUEST['f']=='rss' &&
             RSS_ACCESS) {
             $view = "rss";
         } else if (!WEB_ACCESS) {
             return;
         }
-        $start_time = microtime();
+        if(isset($_REQUEST['raw']) && $_REQUEST['raw'] == true) {
+            $raw = true;
+        } else {
+            $raw = false;
+        }

         if(isset($_SESSION['MAX_PAGES_TO_SHOW']) ) {
             $results_per_page = $_SESSION['MAX_PAGES_TO_SHOW'];
@@ -180,7 +186,7 @@ class SearchController extends Controller implements CrawlConstants
                 $data =
                     $this->processQuery(
                         $query, $activity, $arg,
-                        $results_per_page, $limit, $index_time_stamp);
+                        $results_per_page, $limit, $index_time_stamp, $raw);
                         // calculate the results of a search if there is one
             } else {
                 $highlight = true;
@@ -236,10 +242,14 @@ class SearchController extends Controller implements CrawlConstants
      *      for those query terms will be return, then the eleventh, etc.
      * @param int $index_name the timestamp of an index to use, if 0 then
      *      default used
+     * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
+     *      no grouping but page look-up for links, ($raw == 2)
+     *      no grouping done on data
+     *
      * @return array an array of at most results_per_page many search results
      */
     function processQuery($query, $activity, $arg, $results_per_page,
-        $limit = 0, $index_name = 0)
+        $limit = 0, $index_name = 0, $raw = 0)
     {
         $no_index_given = false;
         if($index_name == 0) {
@@ -291,7 +301,7 @@ class SearchController extends Controller implements CrawlConstants
                 $top_query = implode(" ", $top_phrases);
                 $phrase_results = $this->phraseModel->getPhrasePageResults(
                     $top_query, $limit, $results_per_page, false, NULL,
-                    $use_cache_if_possible);
+                    $use_cache_if_possible, $raw);
                 $data['PAGING_QUERY'] = "index.php?c=search&a=related&arg=".
                     urlencode($url);

@@ -329,7 +339,7 @@ class SearchController extends Controller implements CrawlConstants
                     $filter = $this->searchfiltersModel->getFilter();
                     $phrase_results = $this->phraseModel->getPhrasePageResults(
                         $query, $limit, $results_per_page, true, $filter,
-                        $use_cache_if_possible);
+                        $use_cache_if_possible, $raw);
                     $query = $original_query;
                 }
                 $data['PAGING_QUERY'] = "index.php?q=".urlencode($query);
@@ -451,14 +461,18 @@ class SearchController extends Controller implements CrawlConstants
      *      cache: queries)
      * @param int $results_per_page number of results to return
      * @param int $limit first result to return from the ordered query results
+     * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
+     *      no grouping but page look-up for links, ($raw == 2)
+     *      no grouping done on data
      *
      * @return array associative array of results for the query performed
      */
-    public function queryRequest($query, $results_per_page, $limit = 0)
+    public function queryRequest($query, $results_per_page, $limit = 0,
+        $raw = 0)
     {
         return (API_ACCESS) ?
             $this->processQuery($query, "query", "", $results_per_page,
-            $limit) : NULL;
+                $limit, $raw) : NULL;
     }

     /**
@@ -468,16 +482,18 @@ class SearchController extends Controller implements CrawlConstants
      * @param string $url to find related documents for
      * @param int $results_per_page number of results to return
      * @param int $limit first result to return from the ordered query results
+     * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
+     *      no grouping but page look-up for links, ($raw == 2)
+     *      no grouping done on data
      *
      * @return array associative array of results for the query performed
-
      */
     public function relatedRequest($url, $results_per_page, $limit = 0,
-        $crawl_time = 0)
+        $crawl_time = 0, $raw = 0)
     {
         return (API_ACCESS) ?
             $this->processQuery("", "related", $url, $results_per_page,
-            $limit, $crawl_time) : NULL;
+                $limit, $crawl_time, $raw) : NULL;
     }

     /**
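For the API side, a minimal sketch of calling the extended signatures (not part of this commit; it assumes API_ACCESS is enabled, and the query text, URL, and crawl time are made up for illustration):

    <?php
    // Sketch only: $controller is a SearchController instance.
    $controller = new SearchController();

    // Ten completely ungrouped results (raw mode 2) for a query.
    $data = $controller->queryRequest("chris pollett", 10, 0, 2);

    // Related pages for a URL from the default index, ungrouped but with
    // parent-doc look-up for links (raw mode 1).
    $related = $controller->relatedRequest("http://www.yioop.com/", 10, 0, 0, 1);
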
diff --git a/lib/fetch_url.php b/lib/fetch_url.php
index b3f4a486f..4cabe25d9 100755
--- a/lib/fetch_url.php
+++ b/lib/fetch_url.php
@@ -69,8 +69,6 @@ class FetchUrl implements CrawlConstants
         $key=CrawlConstants::URL, $value=CrawlConstants::PAGE,
         $hash=CrawlConstants::HASH)
     {
-        static $ex_cnt = 0;
-
         $agent_handler = curl_multi_init();

         $active = NULL;
diff --git a/lib/index_bundle_iterators/group_iterator.php b/lib/index_bundle_iterators/group_iterator.php
index 8426ef8f9..d998f934a 100644
--- a/lib/index_bundle_iterators/group_iterator.php
+++ b/lib/index_bundle_iterators/group_iterator.php
@@ -100,10 +100,20 @@ class GroupIterator extends IndexBundleIterator
     var $grouped_hashes;

     /**
+     * Used to keep track of, and weight, pages based on the number of
+     * other pages from the same domain
      * @var array
      */
     var $domain_factors;

+    /**
+     * Flag used to tell the group iterator whether to do the usual grouping
+     * or to only look up parent pages for links whose parent page
+     * hasn't been seen
+     * @var bool
+     */
+    var $only_lookup;
+
     /**
      * the minimum number of pages to group from a block;
      * this trumps $this->index_bundle_iterator->results_per_block
@@ -121,15 +131,21 @@ class GroupIterator extends IndexBundleIterator
      * @param object $index_bundle_iterator to use as a source of documents
      *      to iterate over
      */
-    function __construct($index_bundle_iterator, $num_iterators = 1)
+    function __construct($index_bundle_iterator, $num_iterators = 1,
+        $only_lookup = false)
     {
         $this->index_bundle_iterator = $index_bundle_iterator;
         $this->num_docs = $this->index_bundle_iterator->num_docs;
-        $this->results_per_block = max(
-            $this->index_bundle_iterator->results_per_block,
-            self::MIN_FIND_RESULTS_PER_BLOCK);
-
-        $this->results_per_block /=  ceil($num_iterators/2);
+        if($only_lookup) {
+            $this->results_per_block =
+                $this->index_bundle_iterator->results_per_block;
+        } else {
+            $this->results_per_block = max(
+                $this->index_bundle_iterator->results_per_block,
+                self::MIN_FIND_RESULTS_PER_BLOCK);
+            $this->results_per_block /=  ceil($num_iterators/2);
+        }
+        $this->only_lookup = $only_lookup;
         $this->reset();
     }

@@ -179,21 +195,28 @@ class GroupIterator extends IndexBundleIterator
         $this->current_block_hashes = array();
         $this->current_seen_hashes = array();
         if($this->count_block_unfiltered > 0 ) {
-            /* next we group like documents by url and remember which urls we've
-               seen this block
-            */
-
-            $pre_out_pages = $this->groupByHashUrl($pages);
-
-           /*get doc page for groups of link data if exists and don't have
-             also aggregate by hash
-           */
-           $this->groupByHashAndAggregate($pre_out_pages);
-           $this->count_block = count($pre_out_pages);
-            /*
-                Calculate aggregate values for each field of the groups we found
-             */
-            $pages = $this->computeOutPages($pre_out_pages);
+            if($this->only_lookup) {
+
+                $pages = $this->insertUnseenDocs($pages);
+                $this->count_block = count($pages);
+            } else {
+                /* next we group like documents by url and remember
+                   which urls we've seen this block
+                */
+
+                $pre_out_pages = $this->groupByHashUrl($pages);
+
+               /* get the doc page for groups of link data if it exists and
+                  we don't already have it; also aggregate by hash
+               */
+               $this->groupByHashAndAggregate($pre_out_pages);
+               $this->count_block = count($pre_out_pages);
+                /*
+                    Calculate aggregate values for each field of the groups we
+                    found
+                 */
+                $pages = $this->computeOutPages($pre_out_pages);
+            }
         }
         $this->pages = $pages;
         return $pages;
@@ -293,44 +316,24 @@ class GroupIterator extends IndexBundleIterator
     {
         $domain_vector = array();
         foreach($pre_out_pages as $hash_url => $data) {
-            if(!$pre_out_pages[$hash_url][0][self::IS_DOC]) {
-                $hash_info_url=
-                    crawlHash("info:".base64Hash($hash_url), true);
-                $index = $this->getIndex($pre_out_pages[$hash_url][0]['KEY']);
-                $word_iterator =
-                     new WordIterator($hash_info_url,
-                        $index, true);
-                $doc_array = $word_iterator->currentDocsWithWord();
-                if(is_array($doc_array) && count($doc_array) == 1) {
-                    $relevance =  $this->computeRelevance(
-                        $word_iterator->current_generation,
-                        $word_iterator->current_offset);
-                    $keys = array_keys($doc_array);
-                    $key = $keys[0];
-                    $item = $doc_array[$key];
-                    $item[self::RELEVANCE] = $relevance;
-                    $item[self::SCORE] += $relevance;
-                    $item['KEY'] = $key;
-                    $item['INDEX'] = $word_iterator->index;
-                    $item[self::HASH] = substr($key,
-                        IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
-                    $item[self::INLINKS] = substr($key,
-                        2*IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+            if(!$data[0][self::IS_DOC]) {
+                $item = $this->lookupDoc($data[0]['KEY']);
+                if($item != false) {
                     array_unshift($pre_out_pages[$hash_url], $item);
                 }
             }

             $this->aggregateScores($hash_url, $pre_out_pages[$hash_url]);

-            if(isset($pre_out_pages[$hash_url][0][self::HASH])) {
-                $hash = $pre_out_pages[$hash_url][0][self::HASH];
+            if(isset($pre_out_pages[$hash_url][self::HASH])) {
+                $hash = $pre_out_pages[$hash_url][self::HASH];
                 if(isset($this->grouped_hashes[$hash])) {
                     unset($pre_out_pages[$hash_url]);
                 } else if(isset($this->current_seen_hashes[$hash])) {
                     $previous_url = $this->current_seen_hashes[$hash];
                     if($pre_out_pages[$previous_url][0][
                         self::HASH_SUM_SCORE] >=
-                        $pre_out_pages[$hash_url][0][self::HASH_SUM_SCORE]) {
+                        $pre_out_pages[$hash_url][0][self::HASH_SUM_SCORE]){
                         unset($pre_out_pages[$hash_url]);
                     } else {
                         $this->current_seen_hashes[$hash] = $hash_url;
@@ -343,6 +346,109 @@ class GroupIterator extends IndexBundleIterator
         }
     }

+    /**
+     * Looks up the doc for a link doc_key so that its summary info can
+     * be retrieved
+     *
+     * @param string $doc_key key of the link to look up a doc for
+     *
+     * @return mixed array of info about the doc, or false if none found
+     */
+     function lookupDoc($doc_key)
+     {
+        $hash_url = substr($doc_key, 0, IndexShard::DOC_KEY_LEN);
+        $hash_info_url=
+            crawlHash("info:".base64Hash($hash_url), true);
+        $index = $this->getIndex($doc_key);
+        $word_iterator =
+             new WordIterator($hash_info_url,
+                $index, true);
+        $doc_array = $word_iterator->currentDocsWithWord();
+        $item = false;
+        if(is_array($doc_array) && count($doc_array) == 1) {
+            $relevance =  $this->computeRelevance(
+                $word_iterator->current_generation,
+                $word_iterator->current_offset);
+            $keys = array_keys($doc_array);
+            $key = $keys[0];
+            $item = $doc_array[$key];
+            $item[self::RELEVANCE] = $relevance;
+            $item[self::SCORE] = $item[self::DOC_RANK]*pow(1.1, $relevance);
+            $item['KEY'] = $key;
+            $item['INDEX'] = $word_iterator->index;
+            $item[self::HASH] = substr($key,
+                IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+            $item[self::INLINKS] = substr($key,
+                2*IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+        }
+        return $item;
+     }
+
+    /**
+     *  This function is called if $raw mode 1 was requested. In this
+     *  mode no grouping is done, but if a link does not correspond to
+     *  a doc already listed, then an attempt is made to look up that
+     *  doc
+     *
+     *  @param array $pages an array of links or docs returned by the
+     *     iterator that had been fed into this group iterator
+     *
+     *  @return array new pages where docs have been added if possible
+     */
+     function insertUnseenDocs($pages)
+     {
+        $new_pages = array();
+        $doc_keys = array_keys($pages);
+        $need_docs = array();
+        foreach($doc_keys as $key) {
+           $hash_url = substr($key, 0, IndexShard::DOC_KEY_LEN);
+           $need_docs[$hash_url] = $key;
+        }
+        $need_docs = array_diff_key($need_docs, $this->grouped_keys);
+        foreach($pages as $doc_key => $doc_info) {
+            $doc_info['KEY'] = $doc_key;
+            $hash_url = substr($doc_key, 0, IndexShard::DOC_KEY_LEN);
+            $doc_info[self::HASH] = substr($doc_key,
+                IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+            // inlinks is the domain of the inlink
+            $doc_info[self::INLINKS] = substr($doc_key,
+                2 * IndexShard::DOC_KEY_LEN, IndexShard::DOC_KEY_LEN);
+            $new_pages[$doc_key] = $doc_info;
+            if($doc_info[self::IS_DOC]) {
+                if(isset($need_docs[$hash_url])) {
+                    unset($need_docs[$hash_url]);
+                }
+            }
+            if(!isset($this->grouped_keys[$hash_url])) {
+                /*
+                    new url found in this block
+                */
+                $this->current_block_hashes[] = $hash_url;
+            }
+        }
+
+        $item_pages = array();
+        if(is_array($need_docs)) {
+            $need_docs = array_unique($need_docs);
+            foreach($need_docs as $hash_url => $doc_key) {
+                $item = $this->lookupDoc($doc_key);
+                if($item != false) {
+                    $item_pages[$hash_url] = $item;
+                }
+            }
+        }
+
+        $new_pages = array_merge($new_pages, $item_pages);
+
+        foreach($new_pages as $doc_key => $doc_info) {
+            $new_pages[$doc_key][self::SUMMARY_OFFSET] = array();
+            $new_pages[$doc_key][self::SUMMARY_OFFSET][] =
+                array($doc_info["KEY"], $doc_info[self::GENERATION],
+                        $doc_info[self::SUMMARY_OFFSET]);
+        }
+
+        return $new_pages;
+     }
+
     /**
      * For a collection of grouped pages generates a grouped summary for each
      * group and returns an array of out pages consisting
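A short sketch of the new GroupIterator constructor flag (here $source and $total_iterators stand in for whatever iterator and count the query produced; they are not defined in this snippet):

    <?php
    // Sketch: $source is an IndexBundleIterator over the raw query results.
    // Default behaviour: group like documents by hash of url / content.
    $grouped = new GroupIterator($source, $total_iterators);

    // only_lookup = true: leave results ungrouped, but call lookupDoc() to
    // fetch a parent document for any link whose doc has not been seen.
    $lookup_only = new GroupIterator($source, $total_iterators, true);
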
diff --git a/locale/fr-FR/configure.ini b/locale/fr-FR/configure.ini
index 1537e218f..60a7d9577 100755
--- a/locale/fr-FR/configure.ini
+++ b/locale/fr-FR/configure.ini
@@ -841,10 +841,10 @@ pagination_helper_next = "Proch."
 ; /Applications/XAMPP/xamppfiles/htdocs/git/yioop//views/layouts
 ;
 ; rss_layout.php line: 64
-rss_layout_title = ""
+rss_layout_title = "Moteur de recherche PHP -Yioop! %s"
 ;
 ; rss_layout.php line: 72
-rss_layout_description = ""
+rss_layout_description = "%s Résultats"
 ;
 ; web_layout.php line: 65
 web_layout_title = "Moteur de recherche PHP -Yioop!"
diff --git a/locale/ja/configure.ini b/locale/ja/configure.ini
index 94576f7c3..57d27f49d 100755
--- a/locale/ja/configure.ini
+++ b/locale/ja/configure.ini
@@ -895,7 +895,7 @@ search_view_rank = "ランク:%s"
 search_view_relevancy = "関連:%s"
 ;
 ; search_view.php line: 139
-search_view_proximity = ""
+search_view_proximity = "近さ: %s"
 ;
 ; search_view.php line: 141
 search_view_score = "スコア %s"
diff --git a/locale/vi-VN/configure.ini b/locale/vi-VN/configure.ini
index 1540c2d15..6f390df52 100755
--- a/locale/vi-VN/configure.ini
+++ b/locale/vi-VN/configure.ini
@@ -889,13 +889,13 @@ search_view_calculated = "Đã tính toán trong %s giây
 search_view_results = "Cho kết quả tứ %s - %s của %s"
 ;
 ; search_view.php line: 135
-search_view_rank = "Thứ Tự:"
+search_view_rank = "Thứ Tự: %s"
 ;
 ; search_view.php line: 137
-search_view_relevancy = "Thích hợp:"
+search_view_relevancy = "Thích hợp: %s"
 ;
 ; search_view.php line: 139
-search_view_proximity = ""
+search_view_proximity = "Gần: %s"
 ;
 ; search_view.php line: 141
 search_view_score = "Điểm: %s"
diff --git a/models/phrase_model.php b/models/phrase_model.php
index 8a855ed24..264c8f45d 100755
--- a/models/phrase_model.php
+++ b/models/phrase_model.php
@@ -205,11 +205,16 @@ class PhraseModel extends Model
      *      an attempt will be made to look up the results in either
      *      the file cache or memcache. Otherwise, items will be recomputed
      *      and then potentially restored in cache
+     * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
+     *      no grouping but page look-up for links, ($raw == 2)
+     *      no grouping done on data
+     *
      * @return array an array of summary data
      */
     function getPhrasePageResults(
         $input_phrase, $low = 0, $results_per_page = NUM_RESULTS_PER_PAGE,
-        $format = true, $filter = NULL, $use_cache_if_allowed = true)
+        $format = true, $filter = NULL, $use_cache_if_allowed = true,
+        $raw = 0)
     {
         if(QUERY_STATISTICS) {
             $indent= "  ";
@@ -328,7 +333,7 @@ class PhraseModel extends Model
             }

             $out_results = $this->getSummariesByHash($word_structs,
-                $low, $phrase_num, $filter, $use_cache_if_allowed);
+                $low, $phrase_num, $filter, $use_cache_if_allowed, $raw);

             if(isset($out_results['PAGES']) &&
                 count($out_results['PAGES']) != 0) {
@@ -684,10 +689,14 @@ class PhraseModel extends Model
      *      an attempt will be made to look up the results in either
      *      the file cache or memcache. Otherwise, items will be recomputed
      *      and then potentially restored in cache
+     * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
+     *      no grouping but page look-up for links, ($raw == 2)
+     *      no grouping done on data
+     *
      * @return array document summaries
      */
     function getSummariesByHash($word_structs, $limit, $num, &$filter,
-        $use_cache_if_allowed = true)
+        $use_cache_if_allowed = true, $raw = 0)
     {
         global $CACHE;

@@ -730,7 +739,7 @@ class PhraseModel extends Model
             }
         }

-        $query_iterator = $this->getQueryIterator($word_structs, $filter);
+        $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw);

         $num_retrieved = 0;
         $pages = array();
@@ -748,7 +757,6 @@ class PhraseModel extends Model
                     $num_retrieved++;
                 }
             }
-
         }

         usort($pages, "scoreOrderCallback");
@@ -797,10 +805,15 @@ class PhraseModel extends Model
      *      INDEX_ARCHIVE -- an index_archive object to get results from
      * @param array &$filter an array of hashes of domains to filter from
      *      results
+     * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
+     *      no grouping but page look-up for links, ($raw == 2)
+     *      no grouping done on data
+     *
      * @return &object an iterator for iterating through results to the
      *  query
      */
-    function getQueryIterator($word_structs, &$filter)
+    function getQueryIterator($word_structs, &$filter, $raw = 0)
     {
         $iterators = array();
         $total_iterators = 0;
@@ -858,7 +871,17 @@ class PhraseModel extends Model
             $union_iterator = new UnionIterator($iterators);
         }

-        $group_iterator = new GroupIterator($union_iterator, $total_iterators);
+        $raw = intval($raw);
+        if ($raw == 2) {
+            $group_iterator = $union_iterator;
+        } else if ($raw == 1) {
+
+            $group_iterator =
+                new GroupIterator($union_iterator, $total_iterators, true);
+        } else {
+            $group_iterator =
+                new GroupIterator($union_iterator, $total_iterators);
+        }

         return $group_iterator;
     }
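
Finally, a hedged sketch of exercising the new $raw parameter at the model level (the phrase and paging values are illustrative; $phrase_model is assumed to be a PhraseModel instance):

    <?php
    // $raw = 0: grouped as before; 1: ungrouped, parent docs looked up for
    // links; 2: GroupIterator is bypassed entirely in getQueryIterator().
    $results = $phrase_model->getPhrasePageResults("chris pollett", 0, 10,
        true, NULL, true, 2);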