Fixes broken unit tests related to processors (improves Pdf and Text processors) and IndexShard, a=chris

Chris Pollett [2019-06-05 22:Jun:th]

Fixes broken unit tests related to processors (improves Pdf and Text processors) and IndexShard, a=chris

Filename
src/controllers/SearchController.php
src/controllers/components/CrawlComponent.php
src/executables/Fetcher.php
src/library/IndexShard.php
src/library/PartialZipArchive.php
src/library/UrlParser.php
src/library/processors/HtmlProcessor.php
src/library/processors/PdfProcessor.php
src/library/processors/SvgProcessor.php
src/library/processors/TextProcessor.php
src/library/summarizers/ScrapeSummarizer.php
src/library/summarizers/Summarizer.php
src/locale/ar/configure.ini
src/locale/bn/configure.ini
src/locale/de/configure.ini
src/locale/en_US/configure.ini
src/locale/es/configure.ini
src/locale/fa/configure.ini
src/locale/fr_FR/configure.ini
src/locale/he/configure.ini
src/locale/hi/configure.ini
src/locale/in_ID/configure.ini
src/locale/it/configure.ini
src/locale/ja/configure.ini
src/locale/kn/configure.ini
src/locale/ko/configure.ini
src/locale/nl/configure.ini
src/locale/pl/configure.ini
src/locale/pt/configure.ini
src/locale/ru/configure.ini
src/locale/te/configure.ini
src/locale/th/configure.ini
src/locale/tr/configure.ini
src/locale/vi_VN/configure.ini
src/locale/zh_CN/configure.ini
src/views/elements/PageoptionsElement.php
tests/IndexShardTest.php
tests/PdfProcessorTest.php

diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php
index 765c1f47f..dab0de7ca 100755
--- a/src/controllers/SearchController.php
+++ b/src/controllers/SearchController.php
@@ -1591,6 +1591,10 @@ class SearchController extends Controller implements CrawlConstants
                 $crawl_item[self::URL]);
             $phrase_string = $host_words . " .. " . $crawl_item[self::TITLE] .
                 " ..  ". $path_words . " .. ". $crawl_item[self::DESCRIPTION];
+            if (empty($crawl_item[self::LANG])) {
+                $crawl_item[self::LANG] =
+                    L\guessLocaleFromString($phrase_string);
+            }
             $word_lists = PhraseParser::extractPhrasesInLists(
                 $phrase_string, $crawl_item[self::LANG]);
             $len = strlen($phrase_string);
diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index ecfa2ba0d..25ff84f55 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -1123,7 +1123,7 @@ class CrawlComponent extends Component implements CrawlConstants
             }
         }
         $seed_info = $crawl_model->getSeedInfo();
-        $data['RECRAWL_FREQS'] = [-1=>tl('crawl_component_recrawl_never'),
+        $data['RECRAWL_FREQS'] = [-1 => tl('crawl_component_recrawl_never'),
             1=>tl('crawl_component_recrawl_1day'),
             2=>tl('crawl_component_recrawl_2day'),
             3=>tl('crawl_component_recrawl_3day'),
@@ -1256,7 +1256,9 @@ class CrawlComponent extends Component implements CrawlConstants
                 $seed_info['indexing_plugins']['plugins']
                 : [];
         foreach ($parent->getIndexingPluginList() as $plugin) {
-            if ($plugin == "") {continue; }
+            if ($plugin == "") {
+                continue;
+            }
             $plugin_name = ucfirst($plugin);
             $data['INDEXING_PLUGINS'][$plugin_name]['checked'] =
                 (in_array($plugin_name, $included_plugins)) ?
@@ -1518,6 +1520,11 @@ class CrawlComponent extends Component implements CrawlConstants
             }
             L\convertUtf8IfNeeded($site, self::PAGE, self::ENCODING);
             $data['TESTPAGE'] = $site[self::PAGE];
+            if (empty(PageProcessor::$mime_processor[$site[self::TYPE]])) {
+                return $parent->redirectWithMessage(
+                    tl('crawl_component_page_options_no_processor'),
+                    ["option_type"], true);
+            }
             $processor_name = PageProcessor::$mime_processor[$site[self::TYPE]];
             $plugin_processors = [];
             if (isset($seed_info['indexing_plugins']['plugins'])) {
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 482a00ce8..6837951b0 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -1770,7 +1770,7 @@ class Fetcher implements CrawlConstants
                 $processor = $this->pageProcessor($type);
                 if (!$processor) {
                     L\crawlLog("No page processor for mime type: ".$type);
-                    L\crawlLog("Not processing: ".$site[self::URL]);
+                    L\crawlLog("Not processing: " . $site[self::URL]);
                     continue;
                 }
                 $text_data = $processor->text_data;
diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php
index d4110f010..9b20967a6 100644
--- a/src/library/IndexShard.php
+++ b/src/library/IndexShard.php
@@ -768,6 +768,20 @@ class IndexShard extends PersistentStructure implements CrawlConstants
                 $id_pos + self::DOC_ID_LEN, ($num_keys - 3) * $doc_key_len);
             list($item[self::DESCRIPTION_SCORES], $item[self::USER_RANKS]) =
                 $this->unpackAuxiliaryDocumentKeys($aux_key_string);
+        } else if ($num_keys == 3) {
+            $test_id = rtrim($doc_id, "\x00");
+            if (strlen($test_id) + self::DOC_KEY_LEN == strlen($doc_id) ) {
+                $doc_id = $test_id;
+            }
+        } else if ($num_keys == 2) {
+            $test_id = rtrim($doc_id, "\xFF");
+            if (strlen($test_id) + self::DOC_KEY_LEN == strlen($doc_id)) {
+                $doc_id = $test_id;
+                $test_id = rtrim($doc_id, "\x00");
+                if (strlen($test_id) + self::DOC_KEY_LEN == strlen($doc_id)) {
+                    $doc_id = $test_id;
+                }
+            }
         }
         $occurrences = $this->weightedCount($position_list, $is_doc,
             $title_length, $item[self::DESCRIPTION_SCORES]);
@@ -839,8 +853,10 @@ class IndexShard extends PersistentStructure implements CrawlConstants
      *
      * @param array $position_list positions of term in item
      * @param bool $is_doc whether the item is a document or a link
-     * @param int $title_length
-     * @param array $position_scores
+     * @param int $title_length position in position list at which point
+     *  no longer in title of original doc
+     * @param array $position_scores pairs position => weight
+     *  saying how much a word at a given position range is worth
      * @return array asscoiative array of document_part => weight count
      * of occurrences of term in
      *
@@ -940,7 +956,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants
         }
         $num_term_occurrences = $num_doc_or_links *
             $num_docs/($total_docs_or_links);
-
         $IDF = log(($num_docs - $num_term_occurrences + $half) /
             ($num_term_occurrences + $half));
         $item[self::RELEVANCE] += $half * $IDF * $pre_relevance * $type_weight;
@@ -1369,11 +1384,16 @@ class IndexShard extends PersistentStructure implements CrawlConstants
             $id = substr($this->doc_infos, $i + $doc_key_len,
                 $doc_id_len); /* id is only three keys of the list of keys,
                 remaining keys used for ranker */
+            $test_id = rtrim($id, "\x00");
+            if (strlen($test_id) + self::DOC_KEY_LEN == strlen($id)) {
+                $id = $test_id;
+            }
             if (isset($docid_offsets[$id])) {
                 charCopy(packInt($docid_offsets[$id]), $this->doc_infos,
                     $i, $posting_len);
             } else if ($offset == self::NEEDS_OFFSET_FLAG) {
-                crawlLog("Document:".toHexString($id)." still needs offset");
+                crawlLog("Document:" . toHexString($id) .
+                    " still needs offset");
             }
         }
     }
diff --git a/src/library/PartialZipArchive.php b/src/library/PartialZipArchive.php
index 685092811..1c2ce43ed 100644
--- a/src/library/PartialZipArchive.php
+++ b/src/library/PartialZipArchive.php
@@ -87,12 +87,18 @@ class PartialZipArchive
             if (!$sub_file) {
                 continue;
             }
+            /*
+              Info for offsets can be found at:
+              https://en.wikipedia.org/wiki/Zip_(file_format)#Local_file_header
+              Notice 4 bytes are consumed when do explode, so value below are
+              all 4 less than there
+             */
             $len_string = substr($sub_file, 22, 2);
             $file_name_len = (ord($len_string[1]) << 8) + ord($len_string[0]);
             $len_string = substr($sub_file, 24, 2);
             $extra_field_len = (ord($len_string[1]) << 8) + ord($len_string[0]);
             $file_start = 26 + $file_name_len + $extra_field_len;
-            $len_string = substr($sub_file, 18, 4);
+            $len_string = substr($sub_file, 14, 4);
             $file_size = (((((ord($len_string[3]) << 8) +
                 ord($len_string[2])) << 8) + ord($len_string[1])) << 8) +
                 ord($len_string[0]);
diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php
index 36b01ee09..fd22f9940 100755
--- a/src/library/UrlParser.php
+++ b/src/library/UrlParser.php
@@ -561,7 +561,7 @@ class UrlParser
         if ($url == "") {
             return $default;
         }
-        $url_parts = @parse_url(urlencode($url));
+        $url_parts = @parse_url(str_replace("%3F", "?", urlencode($url)));
         if (!isset($url_parts['path'])) {
             return $default;
         } else if ($url[strlen($url)-1] == "/" || $url[strlen($url)-1] == "\\"){
diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php
index 82d14fed2..0cc9f138d 100755
--- a/src/library/processors/HtmlProcessor.php
+++ b/src/library/processors/HtmlProcessor.php
@@ -121,7 +121,6 @@ class HtmlProcessor extends TextProcessor
                 }
                 $summary[self::LANG] = self::lang($dom,
                     $summary[self::TITLE], $url);
-                echo $summary[self::LANG];
                 $description_dom = $dom;
                 if (!empty($scraper)) {
                     $scrape_results = ScraperManager::applyScraperRules(
@@ -294,21 +293,16 @@ class HtmlProcessor extends TextProcessor
     public static function title($dom)
     {
         $xpath = new \DOMXPath($dom);
-        $titles = $xpath->evaluate("/html//title");
+        $title_parts = ["/html/head/title",
+            "/html//title", "/html//h1", "/html//h2",
+            "/html//h3", "/html//h4", "/html//h5", "/html//h6"];
         $title = "";
-        foreach ($titles as $pre_title) {
-                $title .= $pre_title->nodeValue;
-        }
-        if ($title == "") {
-            $title_parts = ["/html//h1", "/html//h2", "/html//h3",
-                "/html//h4", "/html//h5", "/html//h6"];
-            foreach ($title_parts as $part) {
-                $doc_nodes = $xpath->evaluate($part);
-                foreach ($doc_nodes as $node) {
-                    $title .= " .. " . $node->nodeValue;
-                    if (strlen($title) > self::MAX_TITLE_LEN) {
-                        break 2;
-                    }
+        foreach ($title_parts as $part) {
+            $doc_nodes = $xpath->evaluate($part);
+            foreach ($doc_nodes as $node) {
+                $title =  trim($node->nodeValue);
+                if (!empty($title)) {
+                    break 2;
                 }
             }
         }
@@ -349,85 +343,6 @@ class HtmlProcessor extends TextProcessor
         $body = preg_replace("/\s+/", " ", $body);
         return mb_substr($body, 0, self::$max_description_len);
     }
-    /**
-     * Returns descriptive text concerning a webpage based on its document
-     * object
-     *
-     * @param object $dom   a document object to extract a description from.
-     * @param string $page original page string to extract description from
-     * @return string a description of the page
-     */
-    public static function description($dom, $page)
-    {
-        $xpath = new \DOMXPath($dom);
-        $metas = $xpath->evaluate("/html//meta");
-        $description = "";
-        //look for a meta tag with a description
-        foreach ($metas as $meta) {
-            if (stristr($meta->getAttribute('name'), "description")) {
-                $description .= " .. " . $meta->getAttribute('content');
-            }
-        }
-        if (self::$max_description_len > 2 * C\MAX_DESCRIPTION_LEN) {
-            /* if don't need to summarize much, take meta description
-               from above code, then concatenate body of doc
-               after stripping tags, return result
-             */
-            $description .= "\n" . self::crudeDescription($page);
-            return $description;
-        }
-        /*
-          concatenate the contents of then additional dom elements up to
-          the limit of description length. Choose tags in order of likely
-          importance to this doc
-        */
-        $page_parts = ["/html//p[1]",
-            "/html//div[1]", "/html//p[2]", "/html//div[2]", "/html//p[3]",
-            "/html//div[3]", "/html//p[4]", "/html//div[4]",
-            "/html//td", "/html//li", "/html//dt", "/html//dd",
-            "/html//pre", "/html//a", "/html//article",
-            "/html//section", "/html//cite"];
-        $para_data = [];
-        $len = 0;
-        foreach ($page_parts as $part) {
-            $doc_nodes = $xpath->evaluate($part);
-            foreach ($doc_nodes as $node) {
-                if ($part == "/html//a") {
-                    $content = $node->getAttribute('href')." = ";
-                    $add_len  = min(self::$max_description_len / 2,
-                        mb_strlen($content));
-                    $para_data[$add_len][] = mb_substr($content, 0, $add_len);
-                }
-                $node_text = self::domNodeToString($node);
-                $add_len  = min(self::$max_description_len / 2,
-                    mb_strlen($node_text));
-                $para_data[$add_len][] = mb_substr($node_text, 0, $add_len);
-                $len += $add_len;
-                if ($len > self::$max_description_len) {
-                    break 2;
-                }
-                if (in_array($part, ["/html//p[1]", "/html//div[1]",
-                    "/html//div[2]", "/html//p[2]", "/html//p[3]",
-                    "/html//div[3]", "/html//div[4]", "/html//p[4]"])){
-                    break;
-                }
-            }
-        }
-        krsort($para_data);
-        foreach ($para_data as $add_len => $data) {
-            if (!isset($first_len)) {
-                $first_len = $add_len;
-            }
-            foreach ($data as $datum) {
-                $description .= " .. ". $datum;
-            }
-            if ($first_len > 3 * $add_len) {
-                break;
-            }
-        }
-        $description = preg_replace("/(\s)+/u", " ",  $description);
-        return $description;
-    }
     /**
      * Extracts are location of refresh urls from the meta tags of html page
      * in site
diff --git a/src/library/processors/PdfProcessor.php b/src/library/processors/PdfProcessor.php
index dea673e6f..32752e11d 100755
--- a/src/library/processors/PdfProcessor.php
+++ b/src/library/processors/PdfProcessor.php
@@ -76,40 +76,51 @@ class PdfProcessor extends TextProcessor
     {
         $text = "";
         if (is_string($page)) {
-            $encoding = self::getEncoding($page);
+            list($encoding, $title) = self::getEncodingTitle($page);
             $text =  self::getText($page, $encoding);
         }
         if ($text == "") {
             $text = $url;
         }
         $summary = parent::process($text, $url);
+        if (!empty($title)) {
+            $summary[self::TITLE] = $title;
+        }
         return $summary;
     }
     /**
      * Returns the first encoding format information found in the PDF document
      *
      * @param string $pdf_string a string representing the PDF document
-     * @return string which of the default (if any) PDF encoding formats is
-     *      being used: MacRomanEncoding, WinAnsiEncoding, PDFDocEncoding, etc.
+     * @return array [encoding, title] which of the default (if any) PDF
+     * encoding formats is being used: MacRomanEncoding, WinAnsiEncoding,
+     * PDFDocEncoding, etc as well as a title for the document if found
+     *
      */
-    public static function getEncoding($pdf_string)
+    public static function getEncodingTitle($pdf_string)
     {
         $len = strlen($pdf_string);
         $cur_pos = 0;
         $out = "";
         $i = 0;
         set_error_handler(null);
-        while($cur_pos < $len) {
+        $encoding = "";
+        $title = "";
+        while($cur_pos < $len && (!$encoding || !$title)) {
             list($cur_pos, $object_string) =
                 self::getNextObject($pdf_string, $cur_pos);
             $object_dictionary = self::getObjectDictionary($object_string);
             if (preg_match("/\/(\w+Encoding)\b/", $object_dictionary,
                 $match) != false) {
-                return $match[1];
+                $encoding = $match[1];
+            }
+            if (preg_match("/\/Title\(([^\)]+)\)/", $object_dictionary,
+                $match) != false) {
+                $title = $match[1];
             }
         }
         set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
-        return "";
+        return [$encoding, $title];
     }
     /**
      * Gets the text out of a PDF document
@@ -127,30 +138,38 @@ class PdfProcessor extends TextProcessor
         $out = "";
         $i = 0;
         set_error_handler(null);
+        $state = "text";
         while($cur_pos < $len) {
             list($cur_pos, $object_string) =
                 self::getNextObject($pdf_string, $cur_pos);
             $object_dictionary = self::getObjectDictionary($object_string);
+            if (self::objectDictionaryHas(
+                $object_dictionary, ["Type", "Font", "FontDescriptor"])) {
+                $state = "font";
+                continue;
+            }
             if (!self::objectDictionaryHas(
                 $object_dictionary, ["Image", "Catalog"])) {
                 $stream_data =
                     rtrim(ltrim(self::getObjectStream($object_string)));
+                if ($state == 'font') {
+                    $state = 'text'; // skip only the stream after a font
+                    continue;
+                }
                 if (self::objectDictionaryHas(
                     $object_dictionary, ["FlateDecode"])) {
                     $stream_data = @gzuncompress($stream_data);
                     if (strpos($stream_data, "PS-AdobeFont")) {
-                        $out .= $stream_data;
-                        break;
+                        $out .= $stream_data . "\n\n";
                     }
                     $text = self::parseText($stream_data, $encoding);
-                    $out .= $text;
+                    $out .= $text. "\n\n";
                 } else {
                     $text = self::parseText($stream_data, $encoding);
                     if (strpos($stream_data, "PS-AdobeFont")){
-                        $out .= $stream_data;
-                        break;
+                        $out .= $stream_data . "\n\n";
                     }
-                    $out .= $text;
+                    $out .= $text . "\n\n";
                 }
             }
         }
@@ -241,14 +260,14 @@ class PdfProcessor extends TextProcessor
             $data);
         $len = strlen($data);
         $out = "";
-        $escape_flag =false;
+        $escape_flag = false;
         while($cur_pos < $len) {
             $cur_char = $data[$cur_pos];
             if ($cur_char == '[' && !$escape_flag) {
                 list($cur_pos, $text) = self::parseBrackets($data, $cur_pos,
                     $encoding);
                 $cur_pos--;
-                $out .= " " . $text;
+                $out .= " ". $text;
             }
             if ($cur_char == '\\') {
                 $escape_flag = true;
@@ -276,7 +295,7 @@ class PdfProcessor extends TextProcessor
         $len = strlen($data);
         $out = "";
         $escape_flag =false;
-        $cur_char="";
+        $cur_char = "";
         while($cur_pos < $len && ($cur_char != "]")) {
             $cur_char = $data[$cur_pos];
             if ($cur_char == '(') {
diff --git a/src/library/processors/SvgProcessor.php b/src/library/processors/SvgProcessor.php
index fc8654583..66fe6760e 100644
--- a/src/library/processors/SvgProcessor.php
+++ b/src/library/processors/SvgProcessor.php
@@ -46,7 +46,6 @@ use seekquarry\yioop\library\UrlParser;
  */
 class SvgProcessor extends TextProcessor
 {
-    const MAX_THUMB_LEN = 5000;
     /**
      * Set-ups the any indexing plugins associated with this page
      * processor
@@ -79,7 +78,6 @@ class SvgProcessor extends TextProcessor
      *    used to canonicalize relative links
      *
      * @return array  a summary of the contents of the page
-     *
      */
     public function process($page, $url)
     {
@@ -90,15 +88,15 @@ class SvgProcessor extends TextProcessor
                 $dom = self::dom($page);
             }
             if ($dom !== false && isset($dom->documentElement)) {
-                $summary[self::TITLE] = "";
-                $summary[self::DESCRIPTION] = self::description($dom);
+                $summary[self::TITLE] = self::title($dom, $page);
+                $summary[self::DESCRIPTION] = self::description($dom, $page);
                 $summary[self::LINKS] = [];
                 $summary[self::PAGE] = "<!DOCTYPE html>" .
                     "<html><body><div><img src='data:image/svg+xml;base64," .
                     base64_encode($page)."' alt='".$summary[self::DESCRIPTION].
                     "' /></div></body></html>";
-                if (strlen($page) < self::MAX_THUMB_LEN) {
-                    $thumb_string = self::createThumb($dom);
+                if (strlen($page) < C\PAGE_RANGE_REQUEST) {
+                    $thumb_string = self::createThumb($dom, $page);
                     $summary[self::THUMB] = 'data:image/svg+xml;base64,'.
                         base64_encode($thumb_string);
                 }
@@ -112,9 +110,11 @@ class SvgProcessor extends TextProcessor
      * Used to create an svg thumbnail from a dom object
      *
      * @param object $dom a dom svg image object
-     *
+     * @param string $page content of file to fall back on in case dom
+     *  manipulation fails
+     * @return string containing svg image of thumb
      */
-    public static function createThumb($dom)
+    public static function createThumb($dom, $page)
     {
         $svg = $dom->documentElement;
         if ($svg->hasAttribute("width")) {
@@ -134,7 +134,13 @@ class SvgProcessor extends TextProcessor
         if (!$svg->hasAttribute("viewBox")) {
             $svg->setAttributeNS("", "viewBox", "0 0 $width $height");
         }
-        return $dom->saveXML();
+        $thumb_string = $dom->saveXML();
+        if (!empty($thumb_string)) {
+            return $thumb_string;
+        }
+        $thumb_string = preg_replace('/\<svg\s([^\>]+)\>/',
+            "<svg width='150px' height='150px' >", $page);
+        return $thumb_string;
     }
     /**
      * Return a document object based on a string containing the contents of
@@ -146,27 +152,40 @@ class SvgProcessor extends TextProcessor
      */
     public static function dom($page)
     {
-        $dom = new \DOMDocument();
-        set_error_handler(null);
-        @$dom->loadXML($page);
-        set_error_handler(C\NS_CONFIGS . "yioop_error_handler");
-        return $dom;
+        return L\getDomFromString($page);
     }
     /**
-     * Returns html head title of a webpage based on its document object
+     * Returns title of a svg page based on its document object
      *
-     * @param object $dom   a document object to extract a title from.
+     * @param object $dom a document object to extract a title from.
+     * @param string $page content of file to fall back on in case xpath
+     *  doesn't work
      * @return string  a title of the page
      *
      */
-    public static function title($dom)
+    public static function title($dom, $page)
     {
-        $sites = [];
         $xpath = new \DOMXPath($dom);
-        $titles = $xpath->evaluate("/svg//desc");
+        $title_parts = ["/svg//title", "/svg//desc"];
         $title = "";
-        foreach ($titles as $pre_title) {
-            $title .= $pre_title->textContent;
+        foreach ($title_parts as $part) {
+            $doc_nodes = $xpath->evaluate($part);
+            foreach ($doc_nodes as $node) {
+                $title =  trim($node->nodeValue);
+                if (!empty($title)) {
+                    break 2;
+                }
+            }
+        }
+        if (empty($title)) {
+            list(, $title) = parent::getBetweenTags($page, 0, "<title",
+            "</title");
+            $title = strip_tags("<title" . $title . "</title>");
+        }
+        if (empty($title)) {
+            list(, $title) = parent::getBetweenTags($page, 0, "<desc",
+            "</desc");
+            $title = strip_tags("<desc" . $title . "</desc>");
+        }
         return $title;
     }
@@ -175,9 +194,11 @@ class SvgProcessor extends TextProcessor
      * object
      *
      * @param object $dom a document object to extract a description from.
+     * @param string $page content of file to fall back on in case xpath
+     *  doesn't work
      * @return string a description of the page
      */
-    public static function description($dom)
+    public static function description($dom, $page)
     {
         $sites = [];
         $xpath = new \DOMXPath($dom);
@@ -191,11 +212,21 @@ class SvgProcessor extends TextProcessor
             $doc_nodes = $xpath->evaluate($part);
             foreach ($doc_nodes as $node) {
                 $description .= " ".$node->textContent;
-                if (strlen($description) > self::$max_description_len){
+                if (strlen($description) > self::$max_description_len) {
                     break 2;
                 }
             }
         }
+        if (empty($description)) {
+            list(, $description) = parent::getBetweenTags($page, 0, "<desc",
+            "</desc");
+            $description = strip_tags("<desc" . $description . "</desc>");
+        }
+        if (empty($description)) {
+            list(, $description) = parent::getBetweenTags($page, 0, "<text",
+            "</text");
+            $description = strip_tags("<text" . $description . "</text>");
+        }
         $description = mb_ereg_replace("(\s)+", " ",  $description);
         return $description;
     }
diff --git a/src/library/processors/TextProcessor.php b/src/library/processors/TextProcessor.php
index d7e2c6531..19fb8d597 100755
--- a/src/library/processors/TextProcessor.php
+++ b/src/library/processors/TextProcessor.php
@@ -98,14 +98,14 @@ class TextProcessor extends PageProcessor
             $dom = self::dom($remove_styles_page);
             $summary[self::TITLE] = "";
             $summary[self::LANG] = self::calculateLang($remove_styles_page);
+            $summary[self::PAGE] = "<html><body><div><pre>" .
+                strip_tags($remove_styles_page) . "</pre></div></body></html>";
             list($summary[self::DESCRIPTION], $summary[self::WORD_CLOUD],
                 $summary[self::DESCRIPTION_SCORES]) =
-                $this->summarizer->getSummary($dom, $remove_styles_page,
+                $this->summarizer->getSummary($dom, $summary[self::PAGE],
                     $summary[self::LANG]);
             $summary[self::LINKS] = self::extractHttpHttpsUrls(
                 $remove_styles_page);
-            $summary[self::PAGE] = "<html><body><div><pre>" .
-                strip_tags($remove_styles_page) . "</pre></div></body></html>";
         }
         return $summary;
     }
@@ -245,7 +245,9 @@ class TextProcessor extends PageProcessor
             $body_tags = "<frameset><frame><noscript><img><span><b><i><em>".
                 "<strong><h1><h2><h3><h4><h5><h6><p><div>".
                 "<a><table><tr><td><th><dt><dir><dl><dd>";
-            $body = strip_tags($page, $body_tags);
+            $body = "\n\n" . strip_tags($page, $body_tags). "\n\n";
+            $body = preg_replace("/\n\n(.+)\n\n/s", "<div>\n\$1\n</div>\n",
+                $body);
             $page = "<html><head>$head</head><body>$body</body></html>";
         }
         $dom = L\getDomFromString($page);
diff --git a/src/library/summarizers/ScrapeSummarizer.php b/src/library/summarizers/ScrapeSummarizer.php
index 27e70fcf9..8d4f085a0 100644
--- a/src/library/summarizers/ScrapeSummarizer.php
+++ b/src/library/summarizers/ScrapeSummarizer.php
@@ -89,11 +89,14 @@ class ScrapeSummarizer extends Summarizer
             $block = trim(strip_tags($block));
             $score_pos = pow(1 + $pos, 0.5);
             if (!empty($block)) {
+                $sentences = self::getSentences($block);
                 $weight = (empty($tag_weights[$tag_name])) ? 1.0 :
                     $tag_weights[$tag_name];
-                $blocks[] = $block;
-                $block_ranks[] = ($weight * log(strlen($block) + 1)) /
-                    $score_pos;
+                foreach ($sentences as $sentence) {
+                    $blocks[] = $sentence;
+                    $block_ranks[] = ($weight * log(strlen($sentence) + 1)) /
+                        $score_pos;
+                }
             }
             if ($pos > 0 && !empty($block_ranks[$changeable_index]) &&
                 $max_score/$score_pos < $block_ranks[$changeable_index]) {
diff --git a/src/library/summarizers/Summarizer.php b/src/library/summarizers/Summarizer.php
index 0d1aa36c9..1113f998f 100644
--- a/src/library/summarizers/Summarizer.php
+++ b/src/library/summarizers/Summarizer.php
@@ -351,6 +351,12 @@ class Summarizer
         $summary = "";
         $summary_length = 0;
         $top = self::numSentencesForSummary($sentence_scores, $sentences);
+        if ($top <= 1) {
+            if (!empty($sentences[0])) {
+                $summary = $sentences[0];
+                return [ltrim($summary), [1]];
+            }
+        }
         $summary_indices = array_keys(array_slice($sentence_scores, 0,
             $top - 1, true));
         sort($summary_indices);
diff --git a/src/locale/ar/configure.ini b/src/locale/ar/configure.ini
index e4da687cb..d2b08b6c1 100755
--- a/src/locale/ar/configure.ini
+++ b/src/locale/ar/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/bn/configure.ini b/src/locale/bn/configure.ini
index 6b03cf887..a8ecef528 100755
--- a/src/locale/bn/configure.ini
+++ b/src/locale/bn/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/de/configure.ini b/src/locale/de/configure.ini
index 613e7fa36..18b2a829f 100755
--- a/src/locale/de/configure.ini
+++ b/src/locale/de/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/en_US/configure.ini b/src/locale/en_US/configure.ini
index 2796e15c7..283fadf7c 100644
--- a/src/locale/en_US/configure.ini
+++ b/src/locale/en_US/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = "By Uri"
 crawl_component_test_upload = "By File Upload"
 crawl_component_test_input = "By Direct Input"
 crawl_component_page_options_running_tests = "Running Tests!"
+crawl_component_page_options_no_processor = "No Processor for Mimetype Found!"
 crawl_component_scraper_missing = "Missing Scraper Fields!"
 crawl_component_scraper_added = "Scraper added!"
 crawl_component_no_delete_scraper = "Scraper not deleted!"
diff --git a/src/locale/es/configure.ini b/src/locale/es/configure.ini
index 90e740ab5..f8b8f46b8 100755
--- a/src/locale/es/configure.ini
+++ b/src/locale/es/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/fa/configure.ini b/src/locale/fa/configure.ini
index 0e0824d96..66dd15532 100755
--- a/src/locale/fa/configure.ini
+++ b/src/locale/fa/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/fr_FR/configure.ini b/src/locale/fr_FR/configure.ini
index 61abac3f6..0854d61ab 100755
--- a/src/locale/fr_FR/configure.ini
+++ b/src/locale/fr_FR/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/he/configure.ini b/src/locale/he/configure.ini
index eb0b1bbd8..2b63603a5 100755
--- a/src/locale/he/configure.ini
+++ b/src/locale/he/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/hi/configure.ini b/src/locale/hi/configure.ini
index 58abae73d..56b381f2f 100755
--- a/src/locale/hi/configure.ini
+++ b/src/locale/hi/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/in_ID/configure.ini b/src/locale/in_ID/configure.ini
index 7e614e4be..8f4b86bb1 100755
--- a/src/locale/in_ID/configure.ini
+++ b/src/locale/in_ID/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/it/configure.ini b/src/locale/it/configure.ini
index 0af750108..7e9f9f8a9 100755
--- a/src/locale/it/configure.ini
+++ b/src/locale/it/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/ja/configure.ini b/src/locale/ja/configure.ini
index 1b9348240..a8d958685 100755
--- a/src/locale/ja/configure.ini
+++ b/src/locale/ja/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/kn/configure.ini b/src/locale/kn/configure.ini
index 0a4be1068..5898ce08d 100755
--- a/src/locale/kn/configure.ini
+++ b/src/locale/kn/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/ko/configure.ini b/src/locale/ko/configure.ini
index ff545c472..028e5e469 100755
--- a/src/locale/ko/configure.ini
+++ b/src/locale/ko/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/nl/configure.ini b/src/locale/nl/configure.ini
index dc77d9207..285828dd8 100644
--- a/src/locale/nl/configure.ini
+++ b/src/locale/nl/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = "Uitvoeren van tests!"
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/pl/configure.ini b/src/locale/pl/configure.ini
index d82992f35..1288a8cc8 100755
--- a/src/locale/pl/configure.ini
+++ b/src/locale/pl/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/pt/configure.ini b/src/locale/pt/configure.ini
index ca2df9d2f..6328b4f8d 100755
--- a/src/locale/pt/configure.ini
+++ b/src/locale/pt/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/ru/configure.ini b/src/locale/ru/configure.ini
index 4a544b5b9..976db947b 100755
--- a/src/locale/ru/configure.ini
+++ b/src/locale/ru/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/te/configure.ini b/src/locale/te/configure.ini
index c8d70e647..cca42f7bd 100644
--- a/src/locale/te/configure.ini
+++ b/src/locale/te/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = "టెస్ట్లు నడుస్తున్నవి!"
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/th/configure.ini b/src/locale/th/configure.ini
index 14a063d78..8014cab32 100755
--- a/src/locale/th/configure.ini
+++ b/src/locale/th/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/tr/configure.ini b/src/locale/tr/configure.ini
index 9bb9b135d..d2ee070a2 100755
--- a/src/locale/tr/configure.ini
+++ b/src/locale/tr/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/vi_VN/configure.ini b/src/locale/vi_VN/configure.ini
index a5683d7e0..a01e2386b 100755
--- a/src/locale/vi_VN/configure.ini
+++ b/src/locale/vi_VN/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/locale/zh_CN/configure.ini b/src/locale/zh_CN/configure.ini
index abe72c27e..f1355ae2d 100755
--- a/src/locale/zh_CN/configure.ini
+++ b/src/locale/zh_CN/configure.ini
@@ -437,6 +437,7 @@ crawl_component_test_uri = ""
 crawl_component_test_upload = ""
 crawl_component_test_input = ""
 crawl_component_page_options_running_tests = ""
+crawl_component_page_options_no_processor = ""
 crawl_component_scraper_missing = ""
 crawl_component_scraper_added = ""
 crawl_component_no_delete_scraper = ""
diff --git a/src/views/elements/PageoptionsElement.php b/src/views/elements/PageoptionsElement.php
index 68cf73d34..39d4f6603 100644
--- a/src/views/elements/PageoptionsElement.php
+++ b/src/views/elements/PageoptionsElement.php
@@ -221,7 +221,7 @@ class PageOptionsElement extends Element
             </label></b><input
             id='suffix-phrases' type="checkbox" name="SUFFIX_PHRASES"
             value="true"
-            <?php if (isset($data['SUFFIX_PHRASES']) &&
+            <?php if (!empty($data['SUFFIX_PHRASES']) &&
                 $data['SUFFIX_PHRASES'] != "false") {
                 e("checked='checked'");
              }?>
diff --git a/tests/IndexShardTest.php b/tests/IndexShardTest.php
index 88fef5a5f..8df9dd3c4 100644
--- a/tests/IndexShardTest.php
+++ b/tests/IndexShardTest.php
@@ -250,6 +250,7 @@ class IndexShardTest extends UnitTest
             $this->test_objects['shard2']);
         $c_data = $this->test_objects['shard']->getPostingsSliceById(
             L\crawlHashWord('BBBBBBBB', true), 5);
+        $tmp = array_keys($c_data);
         $this->assertTrue(isset($c_data["AAAAAAAA"]),
             "Data from first shard present 1");
         $c_data = $this->test_objects['shard']->getPostingsSliceById(
@@ -350,8 +351,6 @@ class IndexShardTest extends UnitTest
         $meta_ids = [];
         $this->test_objects['shard']->addDocumentWords($docid,
             $offset, $word_lists, $meta_ids);
-        $c_data = $this->test_objects['shard']->getPostingsSliceById(
-            L\crawlHashWord('BBBBBBBB', true), 5);
         $new_doc_offsets = [
             "AAAAAAAASSSSSSSS" => 5,
             "AAAAAAAAEEEEEEEEFFFFFFFF" => 10,
@@ -371,9 +370,9 @@ class IndexShardTest extends UnitTest
         $i = 0;
         foreach ($predicted_offsets as $key =>$offset) {
             $this->assertTrue(isset($c_data[$key]),
-                "Summary key matches predicted $i");
+                "Summary key matches predicted $key");
             $this->assertEqual($c_data[$key][CrawlConstants::SUMMARY_OFFSET],
-                $offset,  "Summary offset matches predicted $i");
+                $offset,  "Summary offset matches predicted offset $offset");
             $i++;
         }
         $c_data = $this->test_objects['shard']->getPostingsSliceById(
@@ -406,9 +405,9 @@ class IndexShardTest extends UnitTest
         ];
         foreach ($predicted_offsets as $key =>$offset) {
             $this->assertTrue(isset($c_data[$key]),
-                "Summary key matches predicted $i");
+                "Summary key matches predicted $key");
             $this->assertEqual($c_data[$key][CrawlConstants::SUMMARY_OFFSET],
-                $offset,  "Summary offset matches predicted $i");
+                $offset,  "Summary offset matches predicted offset $offset");
             $i++;
         }
     }
diff --git a/tests/PdfProcessorTest.php b/tests/PdfProcessorTest.php
index 897038b2e..2f5c58372 100644
--- a/tests/PdfProcessorTest.php
+++ b/tests/PdfProcessorTest.php
@@ -53,7 +53,7 @@ class PdfProcessorTest extends UnitTest implements CrawlConstants
     {
         $pdf_object = new PdfProcessor();
         $url = "http://www.yioop.com/test.pdf";
-        $filename = C\PARENT_DIR."/tests/test_files/test.pdf";
+        $filename = C\PARENT_DIR . "/tests/test_files/test.pdf";
         $page = file_get_contents($filename);
         $summary = $pdf_object->process($page, $url);
         $this->test_objects['summary'] = $summary;

ViewGit