diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php index 765c1f47f..dab0de7ca 100755 --- a/src/controllers/SearchController.php +++ b/src/controllers/SearchController.php @@ -1591,6 +1591,10 @@ class SearchController extends Controller implements CrawlConstants $crawl_item[self::URL]); $phrase_string = $host_words . " .. " . $crawl_item[self::TITLE] . " .. ". $path_words . " .. ". $crawl_item[self::DESCRIPTION]; + if (empty($crawl_item[self::LANG])) { + $crawl_item[self::LANG] = + L\guessLocaleFromString($phrase_string); + } $word_lists = PhraseParser::extractPhrasesInLists( $phrase_string, $crawl_item[self::LANG]); $len = strlen($phrase_string); diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php index ecfa2ba0d..25ff84f55 100644 --- a/src/controllers/components/CrawlComponent.php +++ b/src/controllers/components/CrawlComponent.php @@ -1123,7 +1123,7 @@ class CrawlComponent extends Component implements CrawlConstants } } $seed_info = $crawl_model->getSeedInfo(); - $data['RECRAWL_FREQS'] = [-1=>tl('crawl_component_recrawl_never'), + $data['RECRAWL_FREQS'] = [-1 => tl('crawl_component_recrawl_never'), 1=>tl('crawl_component_recrawl_1day'), 2=>tl('crawl_component_recrawl_2day'), 3=>tl('crawl_component_recrawl_3day'), @@ -1256,7 +1256,9 @@ class CrawlComponent extends Component implements CrawlConstants $seed_info['indexing_plugins']['plugins'] : []; foreach ($parent->getIndexingPluginList() as $plugin) { - if ($plugin == "") {continue; } + if ($plugin == "") { + continue; + } $plugin_name = ucfirst($plugin); $data['INDEXING_PLUGINS'][$plugin_name]['checked'] = (in_array($plugin_name, $included_plugins)) ? 
@@ -1518,6 +1520,11 @@ class CrawlComponent extends Component implements CrawlConstants } L\convertUtf8IfNeeded($site, self::PAGE, self::ENCODING); $data['TESTPAGE'] = $site[self::PAGE]; + if (empty(PageProcessor::$mime_processor[$site[self::TYPE]])) { + return $parent->redirectWithMessage( + tl('crawl_component_page_options_no_processor'), + ["option_type"], true); + } $processor_name = PageProcessor::$mime_processor[$site[self::TYPE]]; $plugin_processors = []; if (isset($seed_info['indexing_plugins']['plugins'])) { diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 482a00ce8..6837951b0 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -1770,7 +1770,7 @@ class Fetcher implements CrawlConstants $processor = $this->pageProcessor($type); if (!$processor) { L\crawlLog("No page processor for mime type: ".$type); - L\crawlLog("Not processing: ".$site[self::URL]); + L\crawlLog("Not processing: " . $site[self::URL]); continue; } $text_data = $processor->text_data; diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php index d4110f010..9b20967a6 100644 --- a/src/library/IndexShard.php +++ b/src/library/IndexShard.php @@ -768,6 +768,20 @@ class IndexShard extends PersistentStructure implements CrawlConstants $id_pos + self::DOC_ID_LEN, ($num_keys - 3) * $doc_key_len); list($item[self::DESCRIPTION_SCORES], $item[self::USER_RANKS]) = $this->unpackAuxiliaryDocumentKeys($aux_key_string); + } else if ($num_keys == 3) { + $test_id = rtrim($doc_id, "\x00"); + if (strlen($test_id) + self::DOC_KEY_LEN == strlen($doc_id) ) { + $doc_id = $test_id; + } + } else if ($num_keys == 2) { + $test_id = rtrim($doc_id, "\xFF"); + if (strlen($test_id) + self::DOC_KEY_LEN == strlen($doc_id)) { + $doc_id = $test_id; + $test_id = rtrim($doc_id, "\x00"); + if (strlen($test_id) + self::DOC_KEY_LEN == strlen($doc_id)) { + $doc_id = $test_id; + } + } } $occurrences = $this->weightedCount($position_list, $is_doc, $title_length, 
$item[self::DESCRIPTION_SCORES]); @@ -839,8 +853,10 @@ class IndexShard extends PersistentStructure implements CrawlConstants * * @param array $position_list positions of term in item * @param bool $is_doc whether the item is a document or a link - * @param int $title_length - * @param array $position_scores + * @param int $title_length position in position list at which point + * no longer in title of original doc + * @param array $position_scores pairs position => weight + * saying how much a word at a given position range is worth * @return array asscoiative array of document_part => weight count * of occurrences of term in * @@ -940,7 +956,6 @@ class IndexShard extends PersistentStructure implements CrawlConstants } $num_term_occurrences = $num_doc_or_links * $num_docs/($total_docs_or_links); - $IDF = log(($num_docs - $num_term_occurrences + $half) / ($num_term_occurrences + $half)); $item[self::RELEVANCE] += $half * $IDF * $pre_relevance * $type_weight; @@ -1369,11 +1384,16 @@ class IndexShard extends PersistentStructure implements CrawlConstants $id = substr($this->doc_infos, $i + $doc_key_len, $doc_id_len); /* id is only three keys of the list of keys, remaining keys used for ranker */ + $test_id = rtrim($id, "\x00"); + if (strlen($test_id) + self::DOC_KEY_LEN == strlen($id)) { + $id = $test_id; + } if (isset($docid_offsets[$id])) { charCopy(packInt($docid_offsets[$id]), $this->doc_infos, $i, $posting_len); } else if ($offset == self::NEEDS_OFFSET_FLAG) { - crawlLog("Document:".toHexString($id)." still needs offset"); + crawlLog("Document:" . toHexString($id) . 
+ " still needs offset"); } } } diff --git a/src/library/PartialZipArchive.php b/src/library/PartialZipArchive.php index 685092811..1c2ce43ed 100644 --- a/src/library/PartialZipArchive.php +++ b/src/library/PartialZipArchive.php @@ -87,12 +87,18 @@ class PartialZipArchive if (!$sub_file) { continue; } + /* + Info for offsets can be found at: + https://en.wikipedia.org/wiki/Zip_(file_format)#Local_file_header + Notice 4 bytes are consumed when do explode, so value below are + all 4 less than there + */ $len_string = substr($sub_file, 22, 2); $file_name_len = (ord($len_string[1]) << 8) + ord($len_string[0]); $len_string = substr($sub_file, 24, 2); $extra_field_len = (ord($len_string[1]) << 8) + ord($len_string[0]); $file_start = 26 + $file_name_len + $extra_field_len; - $len_string = substr($sub_file, 18, 4); + $len_string = substr($sub_file, 14, 4); $file_size = (((((ord($len_string[3]) << 8) + ord($len_string[2])) << 8) + ord($len_string[1])) << 8) + ord($len_string[0]); diff --git a/src/library/UrlParser.php b/src/library/UrlParser.php index 36b01ee09..fd22f9940 100755 --- a/src/library/UrlParser.php +++ b/src/library/UrlParser.php @@ -561,7 +561,7 @@ class UrlParser if ($url == "") { return $default; } - $url_parts = @parse_url(urlencode($url)); + $url_parts = @parse_url(str_replace("%3F", "?", urlencode($url))); if (!isset($url_parts['path'])) { return $default; } else if ($url[strlen($url)-1] == "/" || $url[strlen($url)-1] == "\\"){ diff --git a/src/library/processors/HtmlProcessor.php b/src/library/processors/HtmlProcessor.php index 82d14fed2..0cc9f138d 100755 --- a/src/library/processors/HtmlProcessor.php +++ b/src/library/processors/HtmlProcessor.php @@ -121,7 +121,6 @@ class HtmlProcessor extends TextProcessor } $summary[self::LANG] = self::lang($dom, $summary[self::TITLE], $url); - echo $summary[self::LANG]; $description_dom = $dom; if (!empty($scraper)) { $scrape_results = ScraperManager::applyScraperRules( @@ -294,21 +293,16 @@ class HtmlProcessor 
extends TextProcessor public static function title($dom) { $xpath = new \DOMXPath($dom); - $titles = $xpath->evaluate("/html//title"); + $title_parts = ["/html/head/title", + "/html//title", "/html//h1", "/html//h2", + "/html//h3", "/html//h4", "/html//h5", "/html//h6"]; $title = ""; - foreach ($titles as $pre_title) { - $title .= $pre_title->nodeValue; - } - if ($title == "") { - $title_parts = ["/html//h1", "/html//h2", "/html//h3", - "/html//h4", "/html//h5", "/html//h6"]; - foreach ($title_parts as $part) { - $doc_nodes = $xpath->evaluate($part); - foreach ($doc_nodes as $node) { - $title .= " .. " . $node->nodeValue; - if (strlen($title) > self::MAX_TITLE_LEN) { - break 2; - } + foreach ($title_parts as $part) { + $doc_nodes = $xpath->evaluate($part); + foreach ($doc_nodes as $node) { + $title = trim($node->nodeValue); + if (!empty($title)) { + break 2; } } } @@ -349,85 +343,6 @@ class HtmlProcessor extends TextProcessor $body = preg_replace("/\s+/", " ", $body); return mb_substr($body, 0, self::$max_description_len); } - /** - * Returns descriptive text concerning a webpage based on its document - * object - * - * @param object $dom a document object to extract a description from. - * @param string $page original page string to extract description from - * @return string a description of the page - */ - public static function description($dom, $page) - { - $xpath = new \DOMXPath($dom); - $metas = $xpath->evaluate("/html//meta"); - $description = ""; - //look for a meta tag with a description - foreach ($metas as $meta) { - if (stristr($meta->getAttribute('name'), "description")) { - $description .= " .. " . $meta->getAttribute('content'); - } - } - if (self::$max_description_len > 2 * C\MAX_DESCRIPTION_LEN) { - /* if don't need to summarize much, take meta description - from above code, then concatenate body of doc - after stripping tags, return result - */ - $description .= "\n" . 
self::crudeDescription($page); - return $description; - } - /* - concatenate the contents of then additional dom elements up to - the limit of description length. Choose tags in order of likely - importance to this doc - */ - $page_parts = ["/html//p[1]", - "/html//div[1]", "/html//p[2]", "/html//div[2]", "/html//p[3]", - "/html//div[3]", "/html//p[4]", "/html//div[4]", - "/html//td", "/html//li", "/html//dt", "/html//dd", - "/html//pre", "/html//a", "/html//article", - "/html//section", "/html//cite"]; - $para_data = []; - $len = 0; - foreach ($page_parts as $part) { - $doc_nodes = $xpath->evaluate($part); - foreach ($doc_nodes as $node) { - if ($part == "/html//a") { - $content = $node->getAttribute('href')." = "; - $add_len = min(self::$max_description_len / 2, - mb_strlen($content)); - $para_data[$add_len][] = mb_substr($content, 0, $add_len); - } - $node_text = self::domNodeToString($node); - $add_len = min(self::$max_description_len / 2, - mb_strlen($node_text)); - $para_data[$add_len][] = mb_substr($node_text, 0, $add_len); - $len += $add_len; - if ($len > self::$max_description_len) { - break 2; - } - if (in_array($part, ["/html//p[1]", "/html//div[1]", - "/html//div[2]", "/html//p[2]", "/html//p[3]", - "/html//div[3]", "/html//div[4]", "/html//p[4]"])){ - break; - } - } - } - krsort($para_data); - foreach ($para_data as $add_len => $data) { - if (!isset($first_len)) { - $first_len = $add_len; - } - foreach ($data as $datum) { - $description .= " .. ". 
$datum; - } - if ($first_len > 3 * $add_len) { - break; - } - } - $description = preg_replace("/(\s)+/u", " ", $description); - return $description; - } /** * Extracts are location of refresh urls from the meta tags of html page * in site diff --git a/src/library/processors/PdfProcessor.php b/src/library/processors/PdfProcessor.php index dea673e6f..32752e11d 100755 --- a/src/library/processors/PdfProcessor.php +++ b/src/library/processors/PdfProcessor.php @@ -76,40 +76,51 @@ class PdfProcessor extends TextProcessor { $text = ""; if (is_string($page)) { - $encoding = self::getEncoding($page); + list($encoding, $title) = self::getEncodingTitle($page); $text = self::getText($page, $encoding); } if ($text == "") { $text = $url; } $summary = parent::process($text, $url); + if (!empty($title)) { + $summary[self::TITLE] = $title; + } return $summary; } /** * Returns the first encoding format information found in the PDF document * * @param string $pdf_string a string representing the PDF document - * @return string which of the default (if any) PDF encoding formats is - * being used: MacRomanEncoding, WinAnsiEncoding, PDFDocEncoding, etc. 
+ * @return array [encoding, title] which of the default (if any) PDF + * encoding formats is being used: MacRomanEncoding, WinAnsiEncoding, + * PDFDocEncoding, etc as well as a title for the document if found + * */ - public static function getEncoding($pdf_string) + public static function getEncodingTitle($pdf_string) { $len = strlen($pdf_string); $cur_pos = 0; $out = ""; $i = 0; set_error_handler(null); - while($cur_pos < $len) { + $encoding = ""; + $title = ""; + while($cur_pos < $len && (!$encoding || !$title)) { list($cur_pos, $object_string) = self::getNextObject($pdf_string, $cur_pos); $object_dictionary = self::getObjectDictionary($object_string); if (preg_match("/\/(\w+Encoding)\b/", $object_dictionary, $match) != false) { - return $match[1]; + $encoding = $match[1]; + } + if (preg_match("/\/Title\(([^\)]+)\)/", $object_dictionary, + $match) != false) { + $title = $match[1]; } } set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); - return ""; + return [$encoding, $title]; } /** * Gets the text out of a PDF document @@ -127,30 +138,38 @@ class PdfProcessor extends TextProcessor $out = ""; $i = 0; set_error_handler(null); + $state = "text"; while($cur_pos < $len) { list($cur_pos, $object_string) = self::getNextObject($pdf_string, $cur_pos); $object_dictionary = self::getObjectDictionary($object_string); + if (self::objectDictionaryHas( + $object_dictionary, ["Type", "Font", "FontDescriptor"])) { + $state = "font"; + continue; + } if (!self::objectDictionaryHas( $object_dictionary, ["Image", "Catalog"])) { $stream_data = rtrim(ltrim(self::getObjectStream($object_string))); + if ($state == 'font') { + $state = 'text'; + continue; + } if (self::objectDictionaryHas( $object_dictionary, ["FlateDecode"])) { $stream_data = @gzuncompress($stream_data); if (strpos($stream_data, "PS-AdobeFont")) { - $out .= $stream_data; - break; + $out .= $stream_data . "\n\n"; } $text = self::parseText($stream_data, $encoding); - $out .= $text; + $out .= $text. 
"\n\n"; } else { $text = self::parseText($stream_data, $encoding); if (strpos($stream_data, "PS-AdobeFont")){ - $out .= $stream_data; - break; + $out .= $stream_data . "\n\n"; } - $out .= $text; + $out .= $text . "\n\n"; } } } @@ -241,14 +260,14 @@ class PdfProcessor extends TextProcessor $data); $len = strlen($data); $out = ""; - $escape_flag =false; + $escape_flag = false; while($cur_pos < $len) { $cur_char = $data[$cur_pos]; if ($cur_char == '[' && !$escape_flag) { list($cur_pos, $text) = self::parseBrackets($data, $cur_pos, $encoding); $cur_pos--; - $out .= " " . $text; + $out .= " ". $text; } if ($cur_char == '\\') { $escape_flag = true; @@ -276,7 +295,7 @@ class PdfProcessor extends TextProcessor $len = strlen($data); $out = ""; $escape_flag =false; - $cur_char=""; + $cur_char = ""; while($cur_pos < $len && ($cur_char != "]")) { $cur_char = $data[$cur_pos]; if ($cur_char == '(') { diff --git a/src/library/processors/SvgProcessor.php b/src/library/processors/SvgProcessor.php index fc8654583..66fe6760e 100644 --- a/src/library/processors/SvgProcessor.php +++ b/src/library/processors/SvgProcessor.php @@ -46,7 +46,6 @@ use seekquarry\yioop\library\UrlParser; */ class SvgProcessor extends TextProcessor { - const MAX_THUMB_LEN = 5000; /** * Set-ups the any indexing plugins associated with this page * processor @@ -79,7 +78,6 @@ class SvgProcessor extends TextProcessor * used to canonicalize relative links * * @return array a summary of the contents of the page - * */ public function process($page, $url) { @@ -90,15 +88,15 @@ class SvgProcessor extends TextProcessor $dom = self::dom($page); } if ($dom !== false && isset($dom->documentElement)) { - $summary[self::TITLE] = ""; - $summary[self::DESCRIPTION] = self::description($dom); + $summary[self::TITLE] = self::title($dom, $page); + $summary[self::DESCRIPTION] = self::description($dom, $page); $summary[self::LINKS] = []; $summary[self::PAGE] = "<!DOCTYPE html>" . 
"<html><body><div><img src='data:image/svg+xml;base64," . base64_encode($page)."' alt='".$summary[self::DESCRIPTION]. "' /></div></body></html>"; - if (strlen($page) < self::MAX_THUMB_LEN) { - $thumb_string = self::createThumb($dom); + if (strlen($page) < C\PAGE_RANGE_REQUEST) { + $thumb_string = self::createThumb($dom, $page); $summary[self::THUMB] = 'data:image/svg+xml;base64,'. base64_encode($thumb_string); } @@ -112,9 +110,11 @@ class SvgProcessor extends TextProcessor * Used to create an svg thumbnail from a dom object * * @param object $dom a dom svg image object - * + * @param string $page content of file to fall back on in case dom + * manipulation fails + * @return string containing svg image of thumb */ - public static function createThumb($dom) + public static function createThumb($dom, $page) { $svg = $dom->documentElement; if ($svg->hasAttribute("width")) { @@ -134,7 +134,13 @@ class SvgProcessor extends TextProcessor if (!$svg->hasAttribute("viewBox")) { $svg->setAttributeNS("", "viewBox", "0 0 $width $height"); } - return $dom->saveXML(); + $thumb_string = $dom->saveXML(); + if (!empty($thumb_string)) { + return $thumb_string; + } + $thumb_string = preg_replace('/\<svg\s(^\>+)\>/', + "<svg width='150px' height='150px' >", $page); + return $thumb_string; } /** * Return a document object based on a string containing the contents of @@ -146,27 +152,40 @@ class SvgProcessor extends TextProcessor */ public static function dom($page) { - $dom = new \DOMDocument(); - set_error_handler(null); - @$dom->loadXML($page); - set_error_handler(C\NS_CONFIGS . "yioop_error_handler"); - return $dom; + return L\getDomFromString($page); } /** - * Returns html head title of a webpage based on its document object + * Returns title of a svg page based on its document object * - * @param object $dom a document object to extract a title from. + * @param object $dom a document object to extract a title from. 
+ * @param string $page content of file to fall back on in case xpath + * doesn't work * @return string a title of the page * */ - public static function title($dom) + public static function title($dom, $page) { - $sites = []; $xpath = new \DOMXPath($dom); - $titles = $xpath->evaluate("/svg//desc"); + $title_parts = ["/svg//title", "/svg//desc"]; $title = ""; - foreach ($titles as $pre_title) { - $title .= $pre_title->textContent; + foreach ($title_parts as $part) { + $doc_nodes = $xpath->evaluate($part); + foreach ($doc_nodes as $node) { + $title = trim($node->nodeValue); + if (!empty($title)) { + break 2; + } + } + } + if (empty($title)) { + list(, $title) = parent::getBetweenTags($page, 0, "<title", + "</title"); + $title = trim(strip_tags("<title" . $title . "</title>")); + } + if (empty($title)) { + list(, $title) = parent::getBetweenTags($page, 0, "<desc", + "</desc"); + return strip_tags("<desc" . $title . "</desc>"); } return $title; } @@ -175,9 +194,11 @@ class SvgProcessor extends TextProcessor * object * * @param object $dom a document object to extract a description from. + * @param string $page content of file to fall back on in case xpath + * doesn't work * @return string a description of the page */ - public static function description($dom) + public static function description($dom, $page) { $sites = []; $xpath = new \DOMXPath($dom); @@ -191,11 +212,21 @@ class SvgProcessor extends TextProcessor $doc_nodes = $xpath->evaluate($part); foreach ($doc_nodes as $node) { $description .= " ".$node->textContent; - if (strlen($description) > self::$max_description_len){ + if (strlen($description) > self::$max_description_len) { break 2; } } } + if (empty($description)) { + list(, $description) = parent::getBetweenTags($page, 0, "<desc", + "</desc"); + $description = trim(strip_tags("<desc" . $description . "</desc>")); + } + if (empty($description)) { + list(, $description) = parent::getBetweenTags($page, 0, "<text", + "</text"); + return strip_tags("<text" . $description . 
"</text>"); + } $description = mb_ereg_replace("(\s)+", " ", $description); return $description; } diff --git a/src/library/processors/TextProcessor.php b/src/library/processors/TextProcessor.php index d7e2c6531..19fb8d597 100755 --- a/src/library/processors/TextProcessor.php +++ b/src/library/processors/TextProcessor.php @@ -98,14 +98,14 @@ class TextProcessor extends PageProcessor $dom = self::dom($remove_styles_page); $summary[self::TITLE] = ""; $summary[self::LANG] = self::calculateLang($remove_styles_page); + $summary[self::PAGE] = "<html><body><div><pre>" . + strip_tags($remove_styles_page) . "</pre></div></body></html>"; list($summary[self::DESCRIPTION], $summary[self::WORD_CLOUD], $summary[self::DESCRIPTION_SCORES]) = - $this->summarizer->getSummary($dom, $remove_styles_page, + $this->summarizer->getSummary($dom, $summary[self::PAGE], $summary[self::LANG]); $summary[self::LINKS] = self::extractHttpHttpsUrls( $remove_styles_page); - $summary[self::PAGE] = "<html><body><div><pre>" . - strip_tags($remove_styles_page) . "</pre></div></body></html>"; } return $summary; } @@ -245,7 +245,9 @@ class TextProcessor extends PageProcessor $body_tags = "<frameset><frame><noscript><img><span><b><i><em>". "<strong><h1><h2><h3><h4><h5><h6><p><div>". "<a><table><tr><td><th><dt><dir><dl><dd>"; - $body = strip_tags($page, $body_tags); + $body = "\n\n" . strip_tags($page, $body_tags). 
"\n\n"; + $body = preg_replace("/\n\n(.+)\n\n/s", '<div>\n$1\n</div>\n', + $body); $page = "<html><head>$head</head><body>$body</body></html>"; } $dom = L\getDomFromString($page); diff --git a/src/library/summarizers/ScrapeSummarizer.php b/src/library/summarizers/ScrapeSummarizer.php index 27e70fcf9..8d4f085a0 100644 --- a/src/library/summarizers/ScrapeSummarizer.php +++ b/src/library/summarizers/ScrapeSummarizer.php @@ -89,11 +89,14 @@ class ScrapeSummarizer extends Summarizer $block = trim(strip_tags($block)); $score_pos = pow(1 + $pos, 0.5); if (!empty($block)) { + $sentences = self::getSentences($block); $weight = (empty($tag_weights[$tag_name])) ? 1.0 : $tag_weights[$tag_name]; - $blocks[] = $block; - $block_ranks[] = ($weight * log(strlen($block) + 1)) / - $score_pos; + foreach ($sentences as $sentence) { + $blocks[] = $sentence; + $block_ranks[] = ($weight * log(strlen($sentence) + 1)) / + $score_pos; + } } if ($pos > 0 && !empty($block_ranks[$changeable_index]) && $max_score/$score_pos < $block_ranks[$changeable_index]) { diff --git a/src/library/summarizers/Summarizer.php b/src/library/summarizers/Summarizer.php index 0d1aa36c9..1113f998f 100644 --- a/src/library/summarizers/Summarizer.php +++ b/src/library/summarizers/Summarizer.php @@ -351,6 +351,12 @@ class Summarizer $summary = ""; $summary_length = 0; $top = self::numSentencesForSummary($sentence_scores, $sentences); + if ($top <= 1) { + if (!empty($sentences[0])) { + $summary = $sentences[0]; + return [ltrim($summary), [1]]; + } + } $summary_indices = array_keys(array_slice($sentence_scores, 0, $top - 1, true)); sort($summary_indices); diff --git a/src/locale/ar/configure.ini b/src/locale/ar/configure.ini index e4da687cb..d2b08b6c1 100755 --- a/src/locale/ar/configure.ini +++ b/src/locale/ar/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" 
+crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/bn/configure.ini b/src/locale/bn/configure.ini index 6b03cf887..a8ecef528 100755 --- a/src/locale/bn/configure.ini +++ b/src/locale/bn/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/de/configure.ini b/src/locale/de/configure.ini index 613e7fa36..18b2a829f 100755 --- a/src/locale/de/configure.ini +++ b/src/locale/de/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/en_US/configure.ini b/src/locale/en_US/configure.ini index 2796e15c7..283fadf7c 100644 --- a/src/locale/en_US/configure.ini +++ b/src/locale/en_US/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "By Uri" crawl_component_test_upload = "By File Upload" crawl_component_test_input = "By Direct Input" crawl_component_page_options_running_tests = "Running Tests!" +crawl_component_page_options_no_processor = "No Processor for Mimetype Found!" crawl_component_scraper_missing = "Missing Scraper Fields!" crawl_component_scraper_added = "Scraper added!" crawl_component_no_delete_scraper = "Scraper not deleted!" 
diff --git a/src/locale/es/configure.ini b/src/locale/es/configure.ini index 90e740ab5..f8b8f46b8 100755 --- a/src/locale/es/configure.ini +++ b/src/locale/es/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/fa/configure.ini b/src/locale/fa/configure.ini index 0e0824d96..66dd15532 100755 --- a/src/locale/fa/configure.ini +++ b/src/locale/fa/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/fr_FR/configure.ini b/src/locale/fr_FR/configure.ini index 61abac3f6..0854d61ab 100755 --- a/src/locale/fr_FR/configure.ini +++ b/src/locale/fr_FR/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/he/configure.ini b/src/locale/he/configure.ini index eb0b1bbd8..2b63603a5 100755 --- a/src/locale/he/configure.ini +++ b/src/locale/he/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff 
--git a/src/locale/hi/configure.ini b/src/locale/hi/configure.ini index 58abae73d..56b381f2f 100755 --- a/src/locale/hi/configure.ini +++ b/src/locale/hi/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/in_ID/configure.ini b/src/locale/in_ID/configure.ini index 7e614e4be..8f4b86bb1 100755 --- a/src/locale/in_ID/configure.ini +++ b/src/locale/in_ID/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/it/configure.ini b/src/locale/it/configure.ini index 0af750108..7e9f9f8a9 100755 --- a/src/locale/it/configure.ini +++ b/src/locale/it/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/ja/configure.ini b/src/locale/ja/configure.ini index 1b9348240..a8d958685 100755 --- a/src/locale/ja/configure.ini +++ b/src/locale/ja/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git 
a/src/locale/kn/configure.ini b/src/locale/kn/configure.ini index 0a4be1068..5898ce08d 100755 --- a/src/locale/kn/configure.ini +++ b/src/locale/kn/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/ko/configure.ini b/src/locale/ko/configure.ini index ff545c472..028e5e469 100755 --- a/src/locale/ko/configure.ini +++ b/src/locale/ko/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/nl/configure.ini b/src/locale/nl/configure.ini index dc77d9207..285828dd8 100644 --- a/src/locale/nl/configure.ini +++ b/src/locale/nl/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "Uitvoeren van tests!" 
+crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/pl/configure.ini b/src/locale/pl/configure.ini index d82992f35..1288a8cc8 100755 --- a/src/locale/pl/configure.ini +++ b/src/locale/pl/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/pt/configure.ini b/src/locale/pt/configure.ini index ca2df9d2f..6328b4f8d 100755 --- a/src/locale/pt/configure.ini +++ b/src/locale/pt/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/ru/configure.ini b/src/locale/ru/configure.ini index 4a544b5b9..976db947b 100755 --- a/src/locale/ru/configure.ini +++ b/src/locale/ru/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/te/configure.ini b/src/locale/te/configure.ini index c8d70e647..cca42f7bd 100644 --- a/src/locale/te/configure.ini +++ b/src/locale/te/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "టెస్ట్లు 
నడుస్తున్నవి!" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/th/configure.ini b/src/locale/th/configure.ini index 14a063d78..8014cab32 100755 --- a/src/locale/th/configure.ini +++ b/src/locale/th/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/tr/configure.ini b/src/locale/tr/configure.ini index 9bb9b135d..d2ee070a2 100755 --- a/src/locale/tr/configure.ini +++ b/src/locale/tr/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/vi_VN/configure.ini b/src/locale/vi_VN/configure.ini index a5683d7e0..a01e2386b 100755 --- a/src/locale/vi_VN/configure.ini +++ b/src/locale/vi_VN/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/locale/zh_CN/configure.ini b/src/locale/zh_CN/configure.ini index abe72c27e..f1355ae2d 100755 --- a/src/locale/zh_CN/configure.ini +++ b/src/locale/zh_CN/configure.ini @@ -437,6 +437,7 @@ crawl_component_test_uri = "" crawl_component_test_upload = "" crawl_component_test_input = "" 
crawl_component_page_options_running_tests = "" +crawl_component_page_options_no_processor = "" crawl_component_scraper_missing = "" crawl_component_scraper_added = "" crawl_component_no_delete_scraper = "" diff --git a/src/views/elements/PageoptionsElement.php b/src/views/elements/PageoptionsElement.php index 68cf73d34..39d4f6603 100644 --- a/src/views/elements/PageoptionsElement.php +++ b/src/views/elements/PageoptionsElement.php @@ -221,7 +221,7 @@ class PageOptionsElement extends Element </label></b><input id='suffix-phrases' type="checkbox" name="SUFFIX_PHRASES" value="true" - <?php if (isset($data['SUFFIX_PHRASES']) && + <?php if (!empty($data['SUFFIX_PHRASES']) && $data['SUFFIX_PHRASES'] != "false") { e("checked='checked'"); }?> diff --git a/tests/IndexShardTest.php b/tests/IndexShardTest.php index 88fef5a5f..8df9dd3c4 100644 --- a/tests/IndexShardTest.php +++ b/tests/IndexShardTest.php @@ -250,6 +250,7 @@ class IndexShardTest extends UnitTest $this->test_objects['shard2']); $c_data = $this->test_objects['shard']->getPostingsSliceById( L\crawlHashWord('BBBBBBBB', true), 5); + $tmp = array_keys($c_data); $this->assertTrue(isset($c_data["AAAAAAAA"]), "Data from first shard present 1"); $c_data = $this->test_objects['shard']->getPostingsSliceById( @@ -350,8 +351,6 @@ class IndexShardTest extends UnitTest $meta_ids = []; $this->test_objects['shard']->addDocumentWords($docid, $offset, $word_lists, $meta_ids); - $c_data = $this->test_objects['shard']->getPostingsSliceById( - L\crawlHashWord('BBBBBBBB', true), 5); $new_doc_offsets = [ "AAAAAAAASSSSSSSS" => 5, "AAAAAAAAEEEEEEEEFFFFFFFF" => 10, @@ -371,9 +370,9 @@ class IndexShardTest extends UnitTest $i = 0; foreach ($predicted_offsets as $key =>$offset) { $this->assertTrue(isset($c_data[$key]), - "Summary key matches predicted $i"); + "Summary key matches predicted $key"); $this->assertEqual($c_data[$key][CrawlConstants::SUMMARY_OFFSET], - $offset, "Summary offset matches predicted $i"); + $offset, "Summary offset 
matches predicted offset $offset"); $i++; } $c_data = $this->test_objects['shard']->getPostingsSliceById( @@ -406,9 +405,9 @@ class IndexShardTest extends UnitTest ]; foreach ($predicted_offsets as $key =>$offset) { $this->assertTrue(isset($c_data[$key]), - "Summary key matches predicted $i"); + "Summary key matches predicted $key"); $this->assertEqual($c_data[$key][CrawlConstants::SUMMARY_OFFSET], - $offset, "Summary offset matches predicted $i"); + $offset, "Summary offset matches predicted offset $offset"); $i++; } } diff --git a/tests/PdfProcessorTest.php b/tests/PdfProcessorTest.php index 897038b2e..2f5c58372 100644 --- a/tests/PdfProcessorTest.php +++ b/tests/PdfProcessorTest.php @@ -53,7 +53,7 @@ class PdfProcessorTest extends UnitTest implements CrawlConstants { $pdf_object = new PdfProcessor(); $url = "http://www.yioop.com/test.pdf"; - $filename = C\PARENT_DIR."/tests/test_files/test.pdf"; + $filename = C\PARENT_DIR . "/tests/test_files/test.pdf"; $page = file_get_contents($filename); $summary = $pdf_object->process($page, $url); $this->test_objects['summary'] = $summary;