Add ability to add links to cache page via keyword_links mechanism, a=chris

Chris Pollett [2013-03-27 22:Mar:th]
Add ability to add links to cache page via keyword_links mechanism, a=chris
Filename
controllers/search_controller.php
lib/archive_bundle_iterators/text_archive_bundle_iterator.php
views/search_view.php
diff --git a/controllers/search_controller.php b/controllers/search_controller.php
index b6823f1c9..aca296eea 100755
--- a/controllers/search_controller.php
+++ b/controllers/search_controller.php
@@ -130,7 +130,7 @@ class SearchController extends Controller implements CrawlConstants
                     // calculate the results of a search if there is one
             } else {
                 $ui_array = array("highlight", "yioop_nav", "history",
-                    "summaries");
+                    "summaries", "version");
                 if(isset($_REQUEST['from_cache'])) {
                     $ui_array[] = "cache_link_referrer";
                 }
@@ -373,11 +373,9 @@ class SearchController extends Controller implements CrawlConstants
         $machine_urls = $this->machineModel->getQueueServerUrls();

         $current_its = $this->crawlModel->getCurrentIndexDatabaseName();
+        $index_timestamp = $this->getIndexTimestamp();

-        if((isset($_REQUEST['its']) || isset($_SESSION['its']))) {
-            $its = (isset($_REQUEST['its'])) ? $_REQUEST['its'] :
-                $_SESSION['its'];
-            $index_timestamp = $this->clean($its, "int");
+        if($index_timestamp != $current_its) {
             if($raw != 1) {
                 if($index_timestamp != 0 ) {
                     //validate timestamp against list
@@ -409,9 +407,6 @@ class SearchController extends Controller implements CrawlConstants
                         //use the default crawl index
                 }
             }
-        } else {
-            $index_timestamp = $current_its;
-                //use the default crawl index
         }

         $index_info = false;
@@ -441,6 +436,28 @@ class SearchController extends Controller implements CrawlConstants
         return array($index_timestamp, $index_info, $save_timestamp);
     }

+    /**
+     * Finds the timestamp of the main crawl or mix to return results from
+     * Does not do checking to make sure timestamp exists.
+     *
+     * @return string current timestamp
+     */
+    function getIndexTimestamp()
+    {
+        static $index_timestamp = -1;
+        if($index_timestamp != -1) {
+            return $index_timestamp;
+        }
+        if((isset($_REQUEST['its']) || isset($_SESSION['its']))) {
+            $its = (isset($_REQUEST['its'])) ? $_REQUEST['its'] :
+                $_SESSION['its'];
+            $index_timestamp = $this->clean($its, "int");
+        } else {
+            $index_timestamp = $this->crawlModel->getCurrentIndexDatabaseName();
+        }
+        return $index_timestamp;
+    }
+
     /**
      * Sometimes robots disobey the statistics page nofollow meta tag.
      * and need to be stopped before they query the whole index
@@ -1114,19 +1131,10 @@ class SearchController extends Controller implements CrawlConstants
                         the cache before going to the live site
                      */
                     if($tag_name != "link" && ($href =="" || $href[0] != "#")) {
-                        if(isset($_SESSION['USER_ID'])) {
-                            $user = $_SESSION['USER_ID'];
-                        } else {
-                            $user = isset($_SERVER['REMOTE_ADDR']) ?
-                                $_SERVER['REMOTE_ADDR'] : "0.0.0.0";
-                        }
-                        $csrf_token = $this->generateCSRFToken($user);
                         $href = urlencode($href);
                         $href = $href."&from_cache=true";
-                        $crawl_time = $this->crawlModel->
-                            getCurrentIndexDatabaseName();
-                        $href =
-                            "?YIOOP_TOKEN=$csrf_token&c=search&a=cache&q&arg".
+                        $crawl_time = $this->getIndexTimestamp();
+                        $href = $this->baseLink()."&a=cache&q&arg".
                             "=$href&its=$crawl_time";
                     }

@@ -1431,12 +1439,14 @@ class SearchController extends Controller implements CrawlConstants
         } else {
             $cache_file = $cache_item[self::PAGE];
         }
-
         if(isset($crawl_item[self::THUMB])) {
             $cache_file = $this->imageCachePage($url, $cache_item, $cache_file,
                 $queue_servers);
             unset($ui_flags["highlight"]);
         }
+        if(isset($crawl_item[self::KEYWORD_LINKS])) {
+            $cache_item[self::KEYWORD_LINKS] = $crawl_item[self::KEYWORD_LINKS];
+        }
         if(in_array('yioop_nav', $ui_flags)) {
             $newDoc = $this->formatCachePage($cache_item, $cache_file, $url,
                 $summary_string, $crawl_time, $all_crawl_times, $terms,
@@ -1580,13 +1590,13 @@ class SearchController extends Controller implements CrawlConstants
         if(is_object($head)) {
             // add a noindex nofollow robot directive to page
             $head_first_child = $head->firstChild;
-            $robotNode = $dom->createElement('meta');
-            $robotNode = $head->insertBefore($robotNode, $head_first_child);
-            $robotNode->setAttribute("name", "ROBOTS");
-            $robotNode->setAttribute("content", "NOINDEX,NOFOLLOW");
+            $robot_node = $dom->createElement('meta');
+            $robot_node = $head->insertBefore($robot_node, $head_first_child);
+            $robot_node->setAttribute("name", "ROBOTS");
+            $robot_node->setAttribute("content", "NOINDEX,NOFOLLOW");
             $comment = $dom->createComment(
                 tl('search_controller_cache_comment'));
-            $comment = $head->insertBefore($comment, $robotNode);
+            $comment = $head->insertBefore($comment, $robot_node);
             // make link and script links absolute
             $head = $this->canonicalizeLinks($head, $url);
         }
@@ -1608,87 +1618,58 @@ class SearchController extends Controller implements CrawlConstants
         $text_align = (getLocaleDirection() == 'ltr') ? "left" : "right";
         // add information about what was extracted from page
         if(in_array("summaries", $ui_flags)) {
-            $summaryNode = $dom->createElement('pre');
-            $summaryNode = $body->insertBefore($summaryNode, $first_child);
-            $summaryNode->setAttributeNS("","style", "border-color: black; ".
-                "border-style:solid; border-width:3px; text-align:$text_align;".
-                "padding: 5px; background-color: white; display:none;");
-            $summaryNode->setAttributeNS("","id", "summary-page-id");
-
-
-            if(isset($cache_item[self::HEADER])) {
-                $summary_string = $cache_item[self::HEADER]."\n".
-                    $summary_string;
-            }
-            $textNode = $dom->createTextNode($summary_string);
-            $summaryNode->appendChild($textNode);
-
-            $scriptNode = $dom->createElement('script');
-            $scriptNode = $body->insertBefore($scriptNode, $summaryNode);
-            $textNode = $dom->createTextNode("var summary_show = 'none';");
-            $scriptNode->appendChild($textNode);
-            $aDivNode = $dom->createElement('div');
-            $body->insertBefore($aDivNode, $summaryNode);
-            $aDivNode->setAttributeNS("","style", "border-color: black; ".
-                "border-style:solid; border-width:3px; margin-bottom:10px;".
-                "padding: 5px; background-color: white; ".
-                "text-align:$text_align;");
+            $summary_toggle_node = $this->createSummaryAndToggleNodes($dom,
+                $text_align, $body, $summary_string, $cache_item);
         } else {
-            $aDivNode = $first_child;
+            $summary_toggle_node = $first_child;
+        }
+        if(isset($cache_item[self::KEYWORD_LINKS]) &&
+            count($cache_item[self::KEYWORD_LINKS]) > 0) {
+            $keyword_node = $this->createDomBoxNode($dom, $text_align,
+                "zIndex: 1");
+            $text_node = $dom->createTextNode("Z@key_links@Z");
+            $keyword_node->appendChild($text_node);
+            $keyword_node = $body->insertBefore($keyword_node,
+                $summary_toggle_node);
+            $set_key_links = true;
+        } else {
+            $keyword_node = $summary_toggle_node;
+            $set_key_links = false;
+        }
+
+        if(in_array("version", $ui_flags)) {
+            $version_node =
+                $this->createDomBoxNode($dom, $text_align, "zIndex: 1");
+            $textNode = $dom->createTextNode(
+                tl('search_controller_cached_version', "Z@url@Z", $date));
+            $version_node->appendChild($textNode);
+            $brNode = $dom->createElement('br');
+            $version_node->appendChild($brNode);
+            $this->addCacheJavascriptTags($dom, $version_node);
+            $version_node = $body->insertBefore($version_node, $keyword_node);
+        } else {
+            $version_node = $keyword_node;
         }
-        $divNode = $dom->createElement('div');
-
-        $divNode = $body->insertBefore($divNode, $aDivNode);
-        $divNode->setAttributeNS("", "style","zIndex: 1");
-        $divNode->setAttributeNS("", "style", "border-color: black; ".
-            "border-style:solid; border-width:3px;margin-bottom:10px;".
-            "padding: 5px; background-color: white; text-align:$text_align;");
-
-        $textNode = $dom->createTextNode(tl('search_controller_cached_version',
-            "Z@url@Z", $date));
-        $divNode->appendChild($textNode);
-        $brNode = $dom->createElement('br');
-        $divNode->appendChild($brNode);
-        $this->addCacheJavascriptTags($dom, $divNode);

         //UI for showing history
         if(in_array("history", $ui_flags)) {
-            $history_div = $this->historyUI($crawl_time, $all_crawl_times,
-                $divNode, $dom, $terms, $hist_ui_open, $url);
+            $history_node = $this->historyUI($crawl_time, $all_crawl_times,
+                $version_node, $dom, $terms, $hist_ui_open, $url);
         } else {
-            $history_div = $dom->createElement('div');
+            $history_node = $dom->createElement('div');
         }

-        //ui for extracted summaries
-        if(in_array("summaries", $ui_flags)) {
-            $aNode = $dom->createElement("a");
-            $aTextNode =
-                $dom->createTextNode(tl('search_controller_header_summaries'));
-            $toggle_code = "javascript:".
-                "summary_show = (summary_show != 'block') ? 'block' : 'none';".
-                "summary_pid = elt('summary-page-id');".
-                "summary_pid.style.display = summary_show;";
-            $aNode->setAttributeNS("", "onclick", $toggle_code);
-            $aNode->setAttributeNS("", "style", "zIndex: 1");
-            $aNode->setAttributeNS("", "style", "text-decoration: underline; ".
-                "cursor: pointer");
-
-            $aNode->appendChild($aTextNode);
-
-            $aDivNode->appendChild($aNode);
-        }
-
-        if($history_div) {
-            $divNode->appendChild($history_div);
+        if($history_node) {
+            $version_node->appendChild($history_node);
         }

         $body = $this->markChildren($body, $words, $dom);

-        $newDoc = $dom->saveHTML();
+        $new_doc = $dom->saveHTML();
         if(substr($url, 0, 7) != "record:") {
             $url = "<a href='$url'>$url</a>";
         }
-        $newDoc = str_replace("Z@url@Z", $url, $newDoc);
+        $new_doc = str_replace("Z@url@Z", $url, $new_doc);
         $colors = array("yellow", "orange", "gray", "cyan");
         $color_count = count($colors);

@@ -1702,15 +1683,113 @@ class SearchController extends Controller implements CrawlConstants
                     "/$word/i", '', $mark_prefix);
                 }
                 $match = $mark_prefix.$word;
-                $newDoc = preg_replace("/$match/i",
+                $new_doc = preg_replace("/$match/i",
                     '<span style="background-color:'.
-                    $colors[$i].'">$0</span>', $newDoc);
+                    $colors[$i].'">$0</span>', $new_doc);
                 $i = ($i + 1) % $color_count;
-                $newDoc = preg_replace("/".$mark_prefix."/", "", $newDoc);
+                $new_doc = preg_replace("/".$mark_prefix."/", "", $new_doc);
             }
         }
+        if($set_key_links) {
+            $new_doc = $this->addKeywordLinks($new_doc, $cache_item);
+        }

-        return $newDoc;
+        return $new_doc;
+    }
+
+    /**
+     *  Function used to add links for keyword searches in keyword_links
+     *  array of $cache_item to the text of the $web_page we are going to
+     *  display the cache of as part of a pache page request
+     *
+     *  @param string $web_page to add links to
+     *  @param array $cache_item original cache item web page generated from
+     *  @return string modified web page
+     */
+    function addKeywordLinks($web_page, &$cache_item)
+    {
+        $base = $this->baseLink()."&its=".$this->getIndexTimestamp();
+        $link_list = "<ul>";
+        foreach($cache_item[self::KEYWORD_LINKS] as $keywords => $text) {
+            $keywords = urlencode($keywords);
+            $link_list .= "<li><a href='$base&q=$keywords' rel='nofollow'>".
+                "$text</a></li>";
+        }
+        $link_list .= "</ul>";
+        $web_page = str_replace("Z@key_links@Z", $link_list, $web_page);
+        return $web_page;
+    }
+
+    /**
+     *  Creates the toggle link and hidden div for extracted header and
+     *  summary element on cache pages
+     *
+     * @param DOMDocument $dom used to create new nodes to add to body object
+     *      for page
+     * @param string $text_align whether rtl or ltr language
+     * @param DOMElement $body represent body of cached page
+     * @param string $summary_string header and summary that were extraced
+     * @param array $cache_item contains infor about the cached item
+     * @return DOMElement a div node with toggle link and hidden div
+     */
+    function createSummaryAndToggleNodes($dom, $text_align, $body,
+        $summary_string, $cache_item)
+    {
+        $first_child = $body->firstChild;
+        $summaryNode = $this->createDomBoxNode($dom, $text_align,
+            "display:none;", 'pre');
+        $summaryNode->setAttributeNS("","id", "summary-page-id");
+        $summaryNode = $body->insertBefore($summaryNode, $first_child);
+
+        if(isset($cache_item[self::HEADER])) {
+            $summary_string = $cache_item[self::HEADER]."\n".
+                $summary_string;
+        }
+        $textNode = $dom->createTextNode($summary_string);
+        $summaryNode->appendChild($textNode);
+
+        $scriptNode = $dom->createElement('script');
+        $scriptNode = $body->insertBefore($scriptNode, $summaryNode);
+        $textNode = $dom->createTextNode("var summary_show = 'none';");
+        $scriptNode->appendChild($textNode);
+
+        $aDivNode = $this->createDomBoxNode($dom, $text_align);
+        $aNode = $dom->createElement("a");
+        $aTextNode =
+            $dom->createTextNode(tl('search_controller_header_summaries'));
+        $toggle_code = "javascript:".
+            "summary_show = (summary_show != 'block') ? 'block' : 'none';".
+            "summary_pid = elt('summary-page-id');".
+            "summary_pid.style.display = summary_show;";
+        $aNode->setAttributeNS("", "onclick", $toggle_code);
+        $aNode->setAttributeNS("", "style", "zIndex: 1;".
+            "text-decoration: underline; cursor: pointer");
+
+        $aNode->appendChild($aTextNode);
+
+        $aDivNode->appendChild($aNode);
+        $body->insertBefore($aDivNode, $summaryNode);
+        return $aDivNode;
+    }
+
+    /**
+     * Creates a bordered tag (usually div) in which to put meta content on a
+     * page when it is displayed
+     *
+     * @param DOMDocument $dom representing cache page
+     * @param string $text_align whether doc is ltr or rtl
+     * @param string $more_styles any additional styles for box
+     * @param string $tag base tag of box (default div)
+     * @return DOMElement of styled box
+     */
+    function createDomBoxNode($dom, $text_align, $more_styles="", $tag="div")
+    {
+        $divNode = $dom->createElement($tag);
+        $divNode->setAttributeNS("","style", "border-color: black; ".
+            "border-style:solid; border-width:3px; margin-bottom:10px;".
+            "padding: 5px; background-color: white; ".
+            "text-align:$text_align; $more_styles");
+        return $divNode;
     }

     /**
@@ -1912,15 +1991,7 @@ class SearchController extends Controller implements CrawlConstants
                     $url_encoded = urlencode($arr[3]);
                     $link_text = $dom->createTextNode("$arr[0] $arr[1] ".
                             "$arr[2]");
-                    if(isset($_SESSION['USER_ID'])) {
-                        $user = $_SESSION['USER_ID'];
-                    } else if (isset($_SERVER['REMOTE_ADDR'])) {
-                        $user = $_SERVER['REMOTE_ADDR'];
-                    } else {
-                        $user = "127.0.0.1";
-                    }
-                    $csrf_token = $this->generateCSRFToken($user);
-                    $link = "?YIOOP_TOKEN=$csrf_token&c=search&a=cache&".
+                    $link = $this->baseAddress."&a=cache&".
                         "q=$terms&arg=$url_encoded&its=$arr[4]&hist_open=true";
                     $link_dom = $dom->createElement("a");
                         $link_dom->setAttributeNS("", "href", $link);
@@ -1939,6 +2010,27 @@ class SearchController extends Controller implements CrawlConstants
         return $d1;
     }

+    /**
+     * Used to create the base link for links to be displayed on caches
+     * of web pages this link points to yioop because links on cache pages
+     * are to other cache pages
+     *
+     * @return string desired base link
+     */
+    function baseLink()
+    {
+        if(isset($_SESSION['USER_ID'])) {
+            $user = $_SESSION['USER_ID'];
+        } else if (isset($_SERVER['REMOTE_ADDR'])) {
+            $user = $_SERVER['REMOTE_ADDR'];
+        } else {
+            $user = "127.0.0.1";
+        }
+        $csrf_token = $this->generateCSRFToken($user);
+        $link = "?".CSRF_TOKEN."=$csrf_token&c=search";
+        return $link;
+    }
+
     /**
      * Display links based on selected year and month in History UI
      * @param array years is an array storing years associated with all indexes
@@ -2003,18 +2095,17 @@ class SearchController extends Controller implements CrawlConstants
         $s1->appendChild($m);
         $d1->appendChild($s1);
         $d1->setAttributeNS("", "style", "display:none");
-        $script = $dom->createElement("script");
-        $script->setAttributeNS("","src", NAME_SERVER."/scripts/basic.js");
-        $d1->appendChild($script);
-        $script = $dom->createElement("script");
-        $script->setAttributeNS("","src", NAME_SERVER."/scripts/history.js");
-        $d1->appendChild($script);
+        $this->addCacheJavascriptTags($dom, $d1);

         return $d1;
     }

     /**
+     * Add to supplied node subnodes containing script tags for javascript
+     * libraries used to display cache pages
      *
+     * @param DOMDocument $dom used to create new nodes
+     * @param DomElement &$node what to add script node to
      */
     function addCacheJavascriptTags($dom, &$node)
     {
diff --git a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
index 1028ed8fe..a7e128583 100644
--- a/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
+++ b/lib/archive_bundle_iterators/text_archive_bundle_iterator.php
@@ -321,26 +321,42 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
         $info = array();
         $info[self::START_PARTITION] = false;
         if(!$this->checkFileHandle() || $this->checkEof()) {
-            $this->current_partition_num++;
-            if($this->current_partition_num >= $this->num_partitions) {
-                $this->end_of_iterator = true;
-                return false;
-            }
-            $this->fileOpen(
-                $this->partitions[$this->current_partition_num]);
-            if($this->switch_partition_callback_name != NULL) {
-                $callback_name = $this->switch_partition_callback_name;
-                $result = $this->$callback_name();
-            }
-            $info[self::START_PARTITION] = true;
+            $this->updatePartition($info);
         }
         $info[self::INI] = $this->ini;
         $info[self::HEADER] = $this->header;
         $info[self::ARC_DATA] = $this->updateBuffer("", true);
+        if(!$info[self::ARC_DATA]) {
+            $this->updatePartition($info);
+            $info[self::ARC_DATA] = $this->updateBuffer("", true);
+        }
         $this->saveCheckpoint();
         return $info;
     }

+    /**
+     *  Helper function for nextChunk to advance the parition if we are
+     *  at the end of the current archive file
+     *
+     *  @param &$info a struct with data about current chunk. will up start
+     *      partition flag
+     */
+    function updatePartition(&$info)
+    {
+        $this->current_partition_num++;
+        if($this->current_partition_num >= $this->num_partitions) {
+            $this->end_of_iterator = true;
+            return false;
+        }
+        $this->fileOpen(
+            $this->partitions[$this->current_partition_num]);
+        if($this->switch_partition_callback_name != NULL) {
+            $callback_name = $this->switch_partition_callback_name;
+            $result = $this->$callback_name();
+        }
+        $info[self::START_PARTITION] = true;
+    }
+
     /**
      * Gets the next at most $num many docs from the iterator. It might return
      * less than $num many documents if the partition changes or the end of the
@@ -362,6 +378,8 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
                     $this->fileClose();
                 }
                 if(!$this->iterate_dir) { //fetcher local case
+                    $this->current_offset = self::BUFFER_SIZE +
+                        self::MAX_RECORD_SIZE;
                     break;
                 }
                 $this->current_partition_num++;
@@ -520,13 +538,12 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
         if($buffer == "") {
             if(!$this->checkFileHandle()) { return false; }
             $success = 1;
+            $seek_pos = $this->buffer_block_num * self::BUFFER_SIZE;
             if($this->compression == "gzip") {
-                $success = gzseek($this->fh, $this->buffer_block_num *
-                    self::BUFFER_SIZE);
+                $success = gzseek($this->fh, $seek_pos);
             }
             if($this->compression == "plain") {
-                $success = fseek($this->fh, $this->buffer_block_num *
-                    self::BUFFER_SIZE);
+                $success = fseek($this->fh, $seek_pos);
             }
             if($success == -1 || !$this->checkFileHandle()
                 || $this->checkEof()) { return false; }
@@ -609,6 +626,7 @@ class TextArchiveBundleIterator extends ArchiveBundleIterator
                 $eof = feof($this->fh);
             break;
         }
+        return $eof;
     }

     /**
diff --git a/views/search_view.php b/views/search_view.php
index c13d328d9..8b89e25d0 100755
--- a/views/search_view.php
+++ b/views/search_view.php
@@ -325,7 +325,7 @@ class SearchView extends View implements CrawlConstants
             } //end foreach
             $this->paginationHelper->render(
                 $data['PAGING_QUERY']."&amp;".CSRF_TOKEN."=".
-                    $data[CSRF_TOKEN]."&amp;its=".$page[self::CRAWL_TIME],
+                    $data[CSRF_TOKEN]."&amp;its=".$data['its'],
                 $data['LIMIT'], $data['RESULTS_PER_PAGE'], $data['TOTAL_ROWS']);
             ?>
ViewGit