mb = function_exists("mb_internal_encoding"); parent::__construct(); } /** * Convert HTML markup to readable text * * Like PHP’s strip_tags but with some small improvements in HTML-to-text conversion that * improves the readability of the text. * * In 3.0.197+ inner content of script, style and object tags is now removed, rather than just the tags. * To revert this behavior or to remove content of additional tags, see the `clearTags` option. * * Note that this method differs from the `Sanitizer::markupToText()` method in that this method is newer, * more powerful and has more options. But the two methods differ in how they perform markup-to-text * conversion so you may want to review and try both to determine which one better suits your needs. * * @param string $str String to convert to text * @param array $options * - `keepTags` (array): Tag names to keep in returned value, i.e. [ "em", "strong" ]. (default=none) * - `clearTags` (array): Tags that should also have their content cleared. (default=[ "script", "style", "object" ]) Since 3.0.197 * - `splitBlocks` (string): String to split paragraph and header elements. (default="\n\n") * - `convertEntities` (bool): Convert HTML entities to plain text equivalents? (default=true) * - `listItemPrefix` (string): Prefix for converted list item `
  • ` elements. (default='• ') * - `linksToUrls` (bool): Convert links to `(url)` rather than removing? (default=true) Since 3.0.132 * - `linksToMarkdown` (bool): Convert links to `[text](url)` rather than removing? (default=false) Since 3.0.197 * - `uppercaseHeadlines` (bool): Convert headline tags to uppercase? (default=false) Since 3.0.132 * - `underlineHeadlines` (bool): Underline headlines with "=" or "-"? (default=true) Since 3.0.132 * - `collapseSpaces` (bool): Collapse extra/redundant extra spaces to single space? (default=true) Since 3.0.132 * - `replacements` (array): Associative array of strings to manually replace. (default=[' ' => ' ']) * @return string * @see Sanitizer::markupToText() * */ public function markupToText($str, array $options = array()) { $sanitizer = $this->wire()->sanitizer; $defaults = array( 'keepTags' => array(), 'clearTags' => array('script', 'style', 'object'), 'linksToUrls' => true, // convert links to just URL rather than removing entirely 'linksToMarkdown' => false, // convert links to Markdown style links 'splitBlocks' => "\n\n", 'uppercaseHeadlines' => false, 'underlineHeadlines' => true, 'convertEntities' => true, 'listItemPrefix' => '• ', 'preIndent' => '', // indent for text within a
    			'collapseSpaces' => true,
    			'replacements' => array(
    				' ' => ' '
    			),
    			'finishReplacements' => array(), // replacements applied at very end (internal)
    		);
    
    		// merge options using arrays
    		foreach(array('replacements') as $key) {
    			if(!isset($options[$key])) continue;
    			$options[$key] = array_merge($defaults[$key], $options[$key]);
    		}
    		
    		$options = array_merge($defaults, $options);
    
    		if(strpos($str, '>') !== false) {
    
    			// strip out everything up to and including , if present
    			if(strpos($str, '') !== false) list(, $str) = explode('', $str);
    
    			// ensure tags are separated by whitespace
    			$str = str_replace('><', '> <', $str);
    
    			// normalize newlines
    			if(strpos($str, "\r") !== false) {
    				$str = str_replace(array("\r\n", "\r"), "\n", $str);
    			}
    
    			// normalize tabs to spaces
    			if(strpos($str, "\t") !== false) {
    				$str = str_replace("\t", " ", $str);
    			}
    
    			// ensure paragraphs and headers are followed by two newlines
    			if(stripos($str, ')!i', '$1' . $options['splitBlocks'], $str);
    			}
    
    			// ensure list items are on their own line and prefixed with a bullet
    			if(stripos($str, ']*>[\s\r\n]*!i', "\n
  • $prefix", $str); if($prefix) { $options['replacements']["\n$prefix "] = "\n$prefix"; // prevent extra space $prefix = trim($prefix); $options['finishReplacements']["\n$prefix\n$prefix"] = ""; // prevent blank items $options['finishReplacements']["\n$prefix\n"] = ""; } } // convert
    tags to be just a single newline if(stripos($str, '', '
    ', '
    ', '
  • '), "
    \n", $str); while(stripos($str, "\n
    ") !== false) $str = str_replace("\n
    ", "
    ", $str); while(stripos($str, "
    \n\n") !== false) $str = str_replace("
    \n\n", "
    \n", $str); } // make headlines more prominent with underlines or uppercase if(($options['uppercaseHeadlines'] || $options['underlineHeadlines']) && stripos($str, ']*>(.+?)!is', $str, $matches)) { foreach($matches[2] as $key => $headline) { $fullMatch = $matches[0][$key]; $tagName = strtolower($matches[1][$key]); $underline = ''; //$headline = trim($headline); if($options['underlineHeadlines']) { $char = $tagName === $topHtag ? '=' : '-'; $underline = "\n" . str_repeat($char, $this->strlen(trim(strip_tags($headline)))); } if($options['uppercaseHeadlines']) $headline = strtoupper($headline); $str = str_replace($fullMatch, "\n\n<$tagName>$headline$underline", $str); } } } // convert "text" tags to "text (url)" if(($options['linksToUrls'] || $options['linksToMarkdown']) && stripos($str, ']*href=([^\s>]+)[^<>]*>(.+?)!is', $str, $matches)) { $links = array(); foreach($matches[0] as $key => $fullMatch) { $href = trim($matches[1][$key], '"\''); if(strpos($href, '#') === 0) continue; // do not convert jumplinks $anchorText = trim($matches[2][$key]); $links[$fullMatch] = "[$anchorText]($href)"; } if(count($links)) { $str = str_replace(array_keys($links), array_values($links), $str); } unset($links); } } // indent within
    ...
    sections if(strlen($options['preIndent']) && strpos($str, '|\s[^>]*>)(.+?)
    !is', $str, $matches)) { foreach($matches[0] as $key => $fullMatch) { $lines = explode("\n", $matches[1][$key]); foreach($lines as $k => $line) { $lines[$k] = ':preIndent:' . rtrim($line); } $str = str_replace($fullMatch, implode("\n", $lines), $str); $options['finishReplacements'][':preIndent:'] = $options['preIndent']; unset($lines); } } } // strip tags AND their contents for specified tags foreach($options['clearTags'] as $s) { $s = strtolower($s); if(stripos($str, "<$s") === false) continue; $str = str_ireplace(array("<$s", " $part) { if(strpos($part, "") === false) { if($key > 0) unset($parts[$key]); // remove nested inner content } else { $endparts = explode("", $part); $parts[$key] = array_pop($endparts); // convert to content after last } } $str = implode("", $parts); unset($parts, $endparts, $s); } } // strip tags if(count($options['keepTags'])) { // some tags will be allowed to remain $keepTags = ''; foreach($options['keepTags'] as $tag) { $keepTags .= "<" . trim($tag, "<>") . ">"; } $str = strip_tags($str, $keepTags); } else { // not allowing any tags $str = strip_tags($str); // if any possible tag characters remain, drop them now $str = str_replace(array('<', '>'), ' ', $str); } // apply any other replacements foreach($options['replacements'] as $find => $replace) { $str = str_ireplace($find, $replace, $str); } // convert entities to plain text equivalents if($options['convertEntities'] && strpos($str, '&') !== false) { $str = $sanitizer->unentities($str); } // collapse any redundant/extra whitespace if($options['collapseSpaces']) { while(strpos($str, ' ') !== false) $str = str_replace(' ', ' ', $str); } // normalize newlines and whitespace around newlines while(strpos($str, " \n") !== false) $str = str_replace(" \n", "\n", $str); while(strpos($str, "\n ") !== false) $str = str_replace("\n ", "\n", $str); while(strpos($str, "\n\n\n") !== false) $str = str_replace("\n\n\n", "\n\n", $str); if(strpos($str, '](')) { // contains links if(strpos($str, '[](') !== false || strpos($str, '[ ](') !== false) { // remove links that lack anchor text $str = preg_replace('!\[\s*\]\([^)]*\)!', '', $str); } if($options['linksToUrls']) { // convert markdown style "[text](url)" to "text (url)" if(!$options['linksToMarkdown']) $str = preg_replace('!\[\s*(.+?)\]\(!', '$1 (', $str); } } if(count($options['finishReplacements'])) { $str = str_replace(array_keys($options['finishReplacements']), array_values($options['finishReplacements']), $str); } return trim($str); } /** * Remove (or close) unclosed HTML tags from given string * * Remove unclosed tags: * --------------------- * At present, if it finds an unclosed tag, it removes all tags of the same kind. * This is in order to keep the function fast, by delegating what it can to strip_tags(). * This is sufficient for our internal use here, but may not be ideal for all situations. * * Fix/close unclosed tags: * ------------------------ * When the remove option is false, it will attempt to close unclosed tags rather than * remove them. It doesn't know exactly where they should be closed, so it appends the * close tags to the end of the string. * * @param string $str * @param bool $remove Remove unclosed tags? If false, it will attempt to close them instead. (default=true) * @param array $options * - `ignoreTags` (array): Tags that can be ignored because they close themselves. (default=per HTML spec) * @return string * */ public function fixUnclosedTags($str, $remove = true, $options = array()) { $defaults = array( 'ignoreTags' => array( 'area','base','br','col','command','embed','hr','img','input', 'keygen','link','menuitem','meta','param','source','track','wbr', ), ); if(isset($options['ignoreTags'])) { // merge user specified ignoreTags with our defaults so that both are used $options['ignoreTags'] = array_merge($defaults['ignoreTags'], $options['ignoreTags']); } $options = array_merge($defaults, $options); $tags = array(); $unclosed = array(); $n1 = substr_count($str, '>'); $n2 = substr_count($str, '" is equal to double the quantity of "'); if($n1 > $n2) { // string might end with a partial tag, i.e. "|\s*/>|\s[^>]+>)!i', $str, $matches)) return $str; foreach($matches[1] as $key => $tag) { if(strpos($matches[2][$key], '/>') !== false) continue; // ignore self closing tags if(in_array(strtolower($tag), $options['ignoreTags'])) continue; $tags[$tag] = $tag; } // count appearances of found tags foreach($tags as $tag) { // count number of open tags of this type $openQty = substr_count($str, "<$tag>") + substr_count($str, "<$tag "); // count number of closing tags of this type $closeQty = substr_count($str, ""); // if quantities do not match, mark tag for deletion if($openQty !== $closeQty) { unset($tags[$tag]); $unclosed[] = $tag; } } if(count($unclosed)) { if($remove) { // strip all tags except those where open/close quantity matched $keepTags = count($tags) ? '<' . implode('><', $tags) . '>' : ''; $str = strip_tags($str, $keepTags); } else { foreach($unclosed as $tag) { $str .= ""; } } } return $str; } /** * Collapse string to plain text that all exists on a single long line without destroying words/punctuation. * * @param string $str String to collapse * @param array $options * - `stripTags` (bool): Strip markup tags? (default=true) * - `keepTags` (array): Array of tag names to keep, if stripTags==true. (default=[]) * - `collapseLinesWith` (string): String to collapse newlines with. (default=' ') * - `linksToUrls` (bool): Convert links to "(url)" rather than removing entirely? (default=false) Since 3.0.132 * - `endBlocksWith` (string): Character or string to insert to identify paragraph/header separation (default='') * - `convertEntities` (bool): Convert entity-encoded characters to text? (default=true) * @return mixed|string * */ public function collapse($str, array $options = array()) { $defaults = array( 'stripTags' => true, 'keepTags' => array(), 'collapseLinesWith' => ' ', 'endBlocksWith' => '', 'convertEntities' => true, 'linksToUrls' => false, ); $options = array_merge($defaults, $options); if($options['stripTags']) { $str = $this->markupToText($str, array( 'underlineHeadlines' => false, 'uppercaseHeadlines' => false, 'convertEntities' => $options['convertEntities'], 'linksToUrls' => $options['linksToUrls'], 'keepTags' => $options['keepTags'], )); if(!strlen($str)) return $str; } // character that we collapse lines with $r = $options['collapseLinesWith']; // convert any tabs to space if(strpos($str, "\t") !== false) { $str = str_replace("\t", " ", $str); } // convert CRs to LFs if(strpos($str, "\r") !== false) { $str = str_replace(array("\r\n", "\r"), "\n", $str); } // collapse whitespace that appears before or after newlines while(strpos($str, " \n") !== false) $str = str_replace(" \n", "\n", $str); while(strpos($str, "\n ") !== false) $str = str_replace("\n ", "\n", $str); // convert redundant LFs to no more than double LFs while(strpos($str, "\n\n\n") !== false) { $str = str_replace("\n\n\n", "\n\n", $str); } // add character to indicate blocks, when asked for if(!empty($options['endBlocksWith'])) { $str = str_replace("\n\n", "$options[endBlocksWith]\n\n", $str); } // replace all types of newlines $str = str_replace(array("\r\n", "\r", "\n\n", "\n"), $r, $str); // while there are consecutives of our collapse string, reduce them to one while(strpos($str, "$r$r") !== false) { $str = str_replace("$r$r", $r, $str); } if($r !== $defaults['collapseLinesWith']) { // replacement of whitespace with something other than another single whitespace // so collapse consecutive spaces to one space, since this would not be already done while(strpos($str, " ") !== false) { $str = str_replace(" ", " ", $str); } // use space rather than replacement char when left side already ends with punctuation foreach($this->getPunctuationChars() as $c) { if(strpos($str, "$c$r")) $str = str_replace("$c$r", "$c ", $str); } } return trim($str); } /** * Truncate string to given maximum length without breaking words * * This method can truncate between words, sentences, punctuation or blocks (like paragraphs). * See the `type` option for details on how it should truncate. By default it truncates between * words. Description of types: * * - word: truncate to closest word. * - punctuation: truncate to closest punctuation within sentence. * - sentence: truncate to closest sentence. * - block: truncate to closest block of text (like a paragraph or headline). * * Note that if your specified `type` is something other than “word”, and it cannot be matched * within the maxLength, then it will attempt a different type. For instance, if you specify * “sentence” as the type, and it cannot match a sentence, it will try to match to “punctuation” * instead. If it cannot match that, then it will attempt “word”. * * HTML will be stripped from returned string. If you want to keep some tags use the `keepTags` or `keepFormatTags` * options to specify what tags are allowed to remain. The `keepFormatTags` option that, when true, will make it * retain all HTML inline text formatting tags. * * ~~~~~~~ * // Truncate string to closest word within 150 characters * $s = $sanitizer->truncate($str, 150); * * // Truncate string to closest sentence within 300 characters * $s = $sanitizer->truncate($str, 300, 'sentence'); * * // Truncate with options * $s = $sanitizer->truncate($str, [ * 'type' => 'punctuation', * 'maxLength' => 300, * 'visible' => true, * 'more' => '…' * ]); * ~~~~~~~ * * @param string $str String to truncate * @param int|array $maxLength Maximum length of returned string, or specify $options array here. * @param array|string $options Options array, or specify `type` option (string). * - `type` (string): Preferred truncation type of word, punctuation, sentence, or block. (default='word') * This is a “preferred type”, not an absolute one, because it will adjust to match what it can within your maxLength. * - `maxLength` (int): Max characters for truncation, used only if $options array substituted for $maxLength argument. * - `maximize` (bool): Include as much as possible within specified type and max-length? (default=true) * If you specify false for the maximize option, it will truncate to first word, puncutation, sentence or block. * - `visible` (bool): When true, invisible text (markup, entities, etc.) does not count towards string length. (default=false) * - `trim` (string): Characters to trim from returned string. (default=',;/ ') * - `noTrim` (string): Never trim these from end of returned string. (default=')]>}”»') * - `more` (string): Append this to truncated strings that do not end with sentence punctuation. (default='…') * - `keepTags` (array): HTML tags that should be kept in returned string. (default=[]) * - `keepFormatTags` (bool): Keep HTML text-formatting tags? Simpler alternative to keepTags option. (default=false) * - `collapseLinesWith` (string): String to collapse lines with where the first is not punctuated. (default=' … ') * - `convertEntities` (bool): Convert HTML entities to non-entity characters? (default=false) * - `noEndSentence` (string): Strings that sentence may not end with, space-separated values (default='Mr. Mrs. …') * @return string * */ function truncate($str, $maxLength, $options = array()) { if(!strlen($str)) return ''; $ent = __(true, 'entityEncode', false); $defaults = array( 'type' => 'word', // word, punctuation, sentence, or block 'maximize' => true, // include as much as possible within the type and maxLength (false=include as little as possible) 'visible' => false, // when true, invisible text (markup, entities, etc.) does not count towards string length. (default=false) 'trim' => $this->_(',;/') . ' ', // Trim these characters from the end of the returned string 'noTrim' => $this->_(')]>}”»'), // Never trim these characters from end of returned string 'more' => '…', // Append to truncated strings that do not end with sentence punctuation 'stripTags' => true, // strip HTML tags? (currently required, see keepTags to keep some) 'keepTags' => array(), // if strip HTML tags is true, optional array of tag names you want to keep 'keepFormatTags' => false, // alternative to keepTags: keep just inline text format tags like strong, em, etc. 'collapseWhitespace' => true, // collapsed whitespace (currently required) 'collapseLinesWith' => ' ' . $this->_('…') . ' ', // String placed between joined lines (like from paragraphs) 'convertEntities' => false, // convert entity encoded characters to non-entity equivalents? (default=false) 'noEndSentence' => $this->_('Mr. Mrs. Ms. Dr. Hon. PhD. i.e. e.g.'), // When in sentence type, words that do not end the sentence (space-separated) ); if($ent) __(true, 'entityEncode', $ent); if(is_string($options) && ctype_alpha($options)) { $defaults['type'] = $options; $options = array(); } if(is_array($maxLength)) { $options = $maxLength; if(!isset($options['maxLength'])) $options['maxLength'] = 0; $maxLength = $options['maxLength']; } else if(is_string($maxLength) && ctype_alpha($maxLength)) { $options['type'] = $maxLength; $maxLength = isset($options['maxLength']) ? $options['maxLength'] : $this->strlen($str); } if(!$maxLength) $maxLength = 255; $options = array_merge($defaults, $options); $type = $options['type']; $str = trim($str); $blockEndChar = '¶'; $tests = array(); $punctuationChars = $this->getPunctuationChars(); $endSentenceChars = $this->getPunctuationChars(true); $endSentenceChars[] = ':'; if($options['keepFormatTags']) { $options['keepTags'] = array_merge($options['keepTags'], array( 'abbr','acronym','b','big','cite','code','em','i','kbd', 'q','samp','small','span','strong','sub','sup','time','var', )); } if($type === 'block') { if($this->strpos($str, $blockEndChar) !== false) $str = str_replace($blockEndChar, ' ', $str); $options['endBlocksWith'] = $blockEndChar; } // collapse whitespace and strip tags $str = $this->collapse($str, $options); if(trim($options['collapseLinesWith']) && $this->strpos($str, $options['collapseLinesWith'])) { // if lines are collapsed with something other than whitespace, avoid using that string // when the line already ends with sentence punctuation foreach($endSentenceChars as $c) { $str = str_replace("$c$options[collapseLinesWith]", "$c ", $str); } } // if anything above reduced the length of the string enough, return it now if($this->strlen($str) <= $maxLength) return $str; // get string at maximum possible length if($options['visible']) { // adjust for only visible length $_str = $str; $str = $this->substr($str, 0, $maxLength); $len = $this->getVisibleLength($str); if($len < $maxLength) { $maxLength += ($maxLength - $len); $str = $this->substr($_str, 0, $maxLength); } unset($_str); } else { $str = $this->substr($str, 0, $maxLength); } // match to closest blocks, like paragraph(s) if($type === 'block') { $pos = $options['maximize'] ? $this->strrpos($str, $blockEndChar) : $this->strpos($str, $blockEndChar); if($pos === false) { $type = 'sentence'; } else { $tests[] = $pos; $options['trim'] .= $blockEndChar; } } // find sentences closest to end if($type === 'sentence') { $this->truncateSentenceTests($str, $tests, $endSentenceChars, $options); if(!count($tests)) $type = 'punctuation'; } // find punctuation closes to end of string if($type === 'punctuation') { foreach($punctuationChars as $find) { $pos = $options['maximize'] ? $this->strrpos($str, $find) : $this->strpos($str, $find); if($pos) $tests[] = $pos; } if(!count($tests)) $type = 'word'; } // find whitespace and last word closest to end of string if($type === 'word' || !count($tests)) { $pos = $options['maximize'] ? $this->strrpos($str, ' ') : $this->strpos($str, ' '); if($pos) $tests[] = $pos; } if(count($tests)) { // we found somewhere to truncate, so truncate at the longest one possible if($options['maximize']) { sort($tests); } else { rsort($tests); } // process our tests do { $pos = array_pop($tests); $result = trim($this->substr($str, 0, $pos + 1)); $lastChar = $this->substr($result, -1); $result = $this->rtrim($result, $options['trim']); if($type === 'sentence' || $type === 'block') { // good to go with result as is } else if(in_array($lastChar, $endSentenceChars)) { // good, end with sentence ending punctuation } else if(in_array($lastChar, $punctuationChars)) { $trims = ' '; foreach($punctuationChars as $c) { if($this->strpos($options['noTrim'], $c) !== false) continue; if(in_array($c, $endSentenceChars)) continue; $trims .= $c; } $result = $this->rtrim($result, $trims) . $options['more']; } else { $result .= $options['more']; } } while(!strlen($result) && count($tests)); // make sure we didn't break any HTML tags as a result of truncation if(strlen($result) && count($options['keepTags']) && strpos($result, '<') !== false) { $result = $this->fixUnclosedTags($result); } } else { // if we didn't find any place to truncate, just return exact truncated string $result = $this->trim($str, $options['trim']) . $options['more']; } if(strlen($options['more'])) { // remove any duplicated more strings $more = $options['more']; while(strpos($result, "$more$more") !== false) { $result = str_replace("$more$more", "$more", $result); } } return $result; } /** * Helper to truncate() method, generate tests/positions for where sentences end * * @param string $str * @param array $tests Tests to append found positions to * @param array $endSentenceChars * @param array $options Options provided to truncate method * */ protected function truncateSentenceTests($str, array &$tests, array $endSentenceChars, array $options) { $chars = $endSentenceChars; $thisStr = $str; $nextStr = ''; $nextOffset = 0; $offset = 0; // offset used for maximize==false mode only $n = 0; // regex matches specified words, plus digits or single letters followed by period $noEndRegex = '!\b(' . str_replace(' ', '|', preg_quote($options['noEndSentence'])) . '|\d+\.|\w\.)$!'; do { if($nextStr) { $offset = $nextOffset; $thisStr = $nextStr; $nextStr = ''; $chars = array('.'); } foreach($chars as $find) { $pos = $options['maximize'] ? $this->strrpos($thisStr, "$find ") : $this->strpos($thisStr, "$find ", $offset); if(!$pos) continue; if($find === '.') { $testStr = $this->substr($thisStr, 0, $pos + 1); if(preg_match($noEndRegex, $testStr, $matches)) { // ends with a disallowed word, next time try to match with a shorter string if($options['maximize']) { $nextStr = $this->substr($testStr, 0, $this->strlen($testStr) - $this->strlen($matches[1]) - 1); } else { $nextOffset = $this->strlen($testStr); } continue; } } $tests[] = $pos; } } while(strlen($nextStr) && ++$n < 3); } /** * Return visible length of string, which is length not counting markup or entities * * @param string $str * @return int * */ public function getVisibleLength($str) { if(strpos($str, '>')) { $str = strip_tags($str); } if(strpos($str, '&') !== false && strpos($str, ';')) { $str = html_entity_decode($str, ENT_QUOTES, 'UTF-8'); } return $this->strlen($str); } /** * Get array of punctuation characters * * @param bool $sentence Get only sentence-ending punctuation * @return array|string * */ public function getPunctuationChars($sentence = false) { $ent = __(true, 'entityEncode', false); if($sentence) { $s = $this->_('. ? !'); // Sentence ending punctuation characters (must be space-separated) } else { $s = $this->_(', : . ? ! “ ” „ " – -- ( ) [ ] { } « »'); // All punctuation characters (must be space-separated) } if($ent) __(true, 'entityEncode', $ent); return explode(' ', $s); } /** * Get alternate words for given word * * This method does not do anything unless an implementation is provided by a module (or something else) * hooking the protected `WireTextTools::wordAlternates($word, $options)` method. Implementation should * populate $event->return with any or all of the following (as available): * * - Word plural(s) * - Word singular(s) * - Word Lemmas * - Word Synonyms * - Anything else applicable to current $user->language * * See the protected WireTextTools::wordAlternates() method for hook instructions and an example. * * @param string $word * @param array $options * - `operator` (string): Operator being used, if applicable (default='') * - `minLength` (int): Minimum word length to return in alternates (default=2) * - `lowercase` (bool): Convert words to lowercase, if not already (default=false) * @return array * @since 3.0.162 * @see WireTextTools::getWordStem() * */ public function getWordAlternates($word, array $options = array()) { if(!$this->hasHook('wordAlternates()')) return array(); $defaults = array( 'operator' => '', 'minLength' => 2, 'lowercase' => false, ); $options = array_merge($defaults, $options); $word = $this->trim($word); $words = array(); $wordLow = $this->strtolower($word); if($options['lowercase']) $word = $wordLow; if(empty($word)) return array(); $alternates = $this->wordAlternates($word, $options); if(!count($alternates)) return array(); // if original word appears in return value, remove it $key = array_search($word, $alternates); if($key !== false) unset($alternates[$key]); // populate $words, removing any invalid or duplicate values foreach($alternates as $w) { if(!is_string($w)) continue; $w = $this->trim($w); $wLow = $this->strtolower($w); if($wLow === $wordLow) continue; // dup of original word if($options['lowercase']) $w = $wLow; // use lowercase if($this->strlen($w) < $options['minLength']) continue; // too short if(isset($words[$wLow])) continue; // already have it $words[$wLow] = $w; } return array_values($words); } /** * Hookable method to return alternate words for given word * * This hookable method is separate from the public getWordAlternates() method so that * we can provide predictable and already-populated $options to whatever is hooking this, as * as provide some additional QA with the return value from modules/hooks. * * It is fine if the return value contains duplicates, the original word, or too-short words, * as the calling getWordAlternates() takes care of those before returning words to user. * Basically, hooks can ignore the `$options` argument, unless they need to know the `operator`, * which may or may not be provided by the caller. * * In hook implementation, avoid deleting what’s already present in $event->return just in * case multiple hooks are adding words. * * ~~~~~ * // Contrived example of how to implement * $wire->addHookAfter('WireTextTools::wordAlternates', function(HookEvent $event) { * $word = $event->arguments(0); // string: word requested alternates for * $words = $event->return; // array: existing return value * * $cats = [ 'cat', 'cats', 'kitty', 'feline', 'felines' ]; * $dogs = [ 'dog', 'dogs', 'doggy', 'canine', 'canines' ]; * * if(in_array($word, $cats)) { * $words = array_merge($words, $cats); * } else if(in_array($word, $dogs)) { * $words = array_merge($words, $dogs); * } * * $event->return = $words; * }); * * // Test it out * $words = $sanitizer->getTextTools()->getWordAlternates('cat'); * echo implode(', ', $words); // outputs: cats, kitty, kitten, feline, felines * ~~~~~ * * #pw-hooker * * @param string $word * @param array $options * - `operator` (string): Operator being used, if applicable (default='') * @return array * @since 3.0.162 * */ protected function ___wordAlternates($word, array $options) { if($word && $options) {} // ignore $alternates = array(); return $alternates; } /** * Find and return all {placeholder} tags found in given string * * @param string $str String that might contain field {tags} * @param array $options * - `has` (bool): Specify true to only return true or false if it has tags (default=false). * - `tagOpen` (string): The required opening tag character(s), default is '{' * - `tagClose` (string): The required closing tag character(s), default is '}' * @return array|bool * @since 3.0.126 * */ public function findPlaceholders($str, array $options = array()) { $defaults = array( 'has' => false, 'tagOpen' => '{', 'tagClose' => '}', ); $options = array_merge($defaults, $options); $tags = array(); $pos1 = strpos($str, $options['tagOpen']); if($pos1 === false) return $options['has'] ? false : $tags; if(strlen($options['tagClose'])) { $pos2 = strpos($str, $options['tagClose']); if($pos2 === false) return $options['has'] ? false : $tags; } $regex = '/' . preg_quote($options['tagOpen']) . '([-_.|a-zA-Z0-9]+)' . preg_quote($options['tagClose']) . '/'; if($options['has']) return (bool) preg_match($regex, $str); if(!preg_match_all($regex, $str, $matches)) return $tags; foreach($matches[0] as $key => $tag) { $name = $matches[1][$key]; $tags[$name] = $tag; } return $tags; } /** * Does the string have any {placeholder} tags in it? * * @param string $str * @param array $options * - `tagOpen` (string): The required opening tag character(s), default is '{' * - `tagClose` (string): The required closing tag character(s), default is '}' * @return bool * @since 3.0.126 * */ public function hasPlaceholders($str, array $options = array()) { $options['has'] = true; return $this->findPlaceholders($str, $options); } /** * Given a string ($str) and values ($vars), populate placeholder “{tags}” in the string with the values * * - The `$vars` should be an associative array of `[ 'tag' => 'value' ]`. * - The `$vars` may also be an object, in which case values will be pulled as properties of the object. * * By default, tags are specified in the format: {first_name} where first_name is the name of the * variable to pull from $vars, `{` is the opening tag character, and `}` is the closing tag char. * * The tag parser can also handle subfields and OR tags, if `$vars` is an object that supports that. * For instance `{products.title}` is a subfield, and `{first_name|title|name}` is an OR tag. * * ~~~~~ * $vars = [ 'foo' => 'FOO!', 'bar' => 'BAR!' ]; * $str = 'This is a test: {foo}, and this is another test: {bar}'; * echo $sanitizer->getTextTools()->populatePlaceholders($str, $vars); * // outputs: This is a test: FOO!, and this is another test: BAR! * ~~~~~ * * @param string $str The string to operate on (where the {tags} might be found) * @param WireData|object|array $vars Object or associative array to pull replacement values from. * @param array $options Array of optional changes to default behavior, including: * - `tagOpen` (string): The required opening tag character(s), default is '{' * - `tagClose` (string): The optional closing tag character(s), default is '}' * - `recursive` (bool): If replacement value contains tags, populate those too? (default=false) * - `removeNullTags` (bool): If a tag resolves to a NULL, remove it? If false, tag will remain. (default=true) * - `entityEncode` (bool): Entity encode the values pulled from $vars? (default=false) * - `entityDecode` (bool): Entity decode the values pulled from $vars? (default=false) * - `allowMarkup` (bool): Allow markup to appear in populated variables? (default=true) * @return string String with tags populated. * @since 3.0.126 Use wirePopulateStringTags() function for older versions * */ public function populatePlaceholders($str, $vars, array $options = array()) { $defaults = array( 'tagOpen' => '{', // opening tag (required) 'tagClose' => '}', // closing tag (optional) 'recursive' => false, // if replacement value contains tags, populate those too? 'removeNullTags' => true, // if a tag value resolves to a NULL, remove it? If false, tag will be left in tact. 'entityEncode' => false, // entity encode values pulled from $vars? 'entityDecode' => false, // entity decode values pulled from $vars? 'allowMarkup' => true, // allow markup to appear in populated variables? ); $options = array_merge($defaults, $options); $optionsNoRecursive = $options['recursive'] ? array_merge($options, array('recursive' => false)) : $options; $replacements = array(); $tags = $this->findPlaceholders($str, $options); // create a list of replacements by finding replacement values in $vars foreach($tags as $fieldName => $tag) { if(isset($replacements[$tag])) continue; // if already found, do not do it again $fieldValue = null; if(is_object($vars)) { if($vars instanceof Page) { $fieldValue = $options['allowMarkup'] ? $vars->getMarkup($fieldName) : $vars->getText($fieldName); } else if($vars instanceof WireData) { $fieldValue = $vars->get($fieldName); } else { $fieldValue = $vars->$fieldName; } } else if(is_array($vars)) { $fieldValue = isset($vars[$fieldName]) ? $vars[$fieldName] : null; } // if value resolves to null and we are not removing null tags, then do not add to replacements if($fieldValue === null && !$options['removeNullTags']) continue; $fieldValue = (string) $fieldValue; if(!$options['allowMarkup'] && strpos($fieldValue, '<') !== false) $fieldValue = strip_tags($fieldValue); if($options['entityEncode']) $fieldValue = htmlentities($fieldValue, ENT_QUOTES, 'UTF-8', false); if($options['entityDecode']) $fieldValue = html_entity_decode($fieldValue, ENT_QUOTES, 'UTF-8'); if($options['recursive'] && strpos($fieldValue, $options['tagOpen']) !== false) { $fieldValue = $this->populatePlaceholders($fieldValue, $vars, $optionsNoRecursive); } $replacements[$tag] = $fieldValue; } // replace the tags if(count($tags)) { $str = str_replace(array_keys($replacements), array_values($replacements), $str); } return $str; } /** * Populate placeholders in string with sanitizers applied to populated values * * These placeholders accept one or more sanitizer names as part `{placeholder}` in the format `{placeholder:sanitizers}`, * where `placeholder` is the name of a variable accessible from `$data` argument and `sanitizers` is the name of a * sanitizer method or a CSV string of sanitizer methods. Placeholders with any whitespace are ignored. * * #pw-internal * * ~~~~~ * $tools = $sanitizer->getTextTools(); * $data = [ 'name' => 'John Smith', 'age' => 46.5 ]; * * $str = "My name is {name:camelCase}, my age is {age:int}"; * echo $tools->placeholderSanitizers($str, $data); // outputs: My name is johnBobSmith, my age is 46 * * $str = "My name is {name:removeWhitespace,entities}, my age is {age:float}"; * echo $tools->placeholderSanitizers($str, $data); // outputs: My name is John<Bob>Smith, my age is 46.5 * * $str = "My name is {name:text,word}, my age is {age:digits}"; * echo $tools->placeholderSanitizers($str, $data); // outputs: My name is John, my age is 465 * ~~~~~ * * @param string $str * @param array|WireData|WireInputData * @param array $options * @return string * @throws WireException * @since 3.0.178 * @todo currently 'protected' for later use * */ protected function placeholderSanitizers($str, $data, array $options = array()) { $defaults = array( 'tagOpen' => '{', 'tagClose' => '}', 'sanitizersBefore' => array('string'), // sanitizers to apply before requested ones 'sanitizersAfter' => array(), // sanitizers to apply after requested ones 'sanitizersDefault' => array('text'), // defaults if only {var} is presented without {var:sanitizer} ); $options = array_merge($defaults, $options); $sanitizer = $this->wire()->sanitizer; $dataIsArray = is_array($data); $replacements = array(); $parts = array(); if(strpos($str, $options['tagOpen']) === false || !strpos($str, $options['tagClose'])) return $str; if(!is_array($data) && !$data instanceof WireData && !$data instanceof WireInputData) { throw new WireException('$data argument must be associative array, WireData or WireInputData'); } list($tagOpen, $tagClose) = array(preg_quote($options['tagOpen']), preg_quote($options['tagClose'])); $regex = '/OPEN([-_.a-z0-9]+)(:[_,a-z0-9]+CLOSE|CLOSE)/i'; $regex = str_replace(array('OPEN', 'CLOSE'), array($tagOpen, $tagClose), $regex); if(!preg_match_all($regex, $str, $matches)) return $str; foreach($matches[0] as $key => $placeholder) { $varName = $matches[1][$key]; $sanitizers = trim($matches[2][$key], ':}'); $sanitizers = strlen($sanitizers) ? explode(',', $sanitizers) : array(); if(!count($sanitizers)) $sanitizers = $options['sanitizersDefault']; if($dataIsArray) { /** @var array $data */ $value = isset($data[$varName]) ? $data[$varName] : null; } else { /** @var WireData|WireInputData $data */ $value = $data->get($varName); } $n = 0; foreach(array($options['sanitizersBefore'], $sanitizers, $options['sanitizersAfter']) as $methods) { foreach($methods as $method) { if(!$sanitizer->methodExists($method)) throw new WireException("Unknown sanitizer method: $method"); $value = $sanitizer->sanitize($value, $method); $n++; } } if(!$n) $value = $placeholder; $replacements[] = array($placeholder, $value); } // piece it back together manually so values in $data cannot introduce more placeholders foreach($replacements as $item) { list($placeholder, $value) = $item; list($before, $after) = explode($placeholder, $str, 2); $parts[] = $before . $value; $str = $after; } return implode('', $parts) . $str; } /** * Populate placeholders with optional sanitizers in a selector string * * #pw-internal * * @param string $selectorString * @param array|WireData|WireInputData * @param array $options * @return string * @throws WireException * @since 3.0.178 * @todo currently 'protected' for later use * */ protected function placeholderSelector($selectorString, $data, array $options = array()) { if(!isset($options['sanitizersBefore'])) $options['sanitizersBefore'] = array(); if(!isset($options['sanitizersAfter'])) $options['sanitizersAfter'] = array(); $options['sanitizersBefore'][] = 'text'; $options['sanitizersAfter'][] = 'selectorValue'; return $this->placeholderSanitizers($selectorString, $data, $options); } /** * Given two arrays, return array of the changes with 'ins' and 'del' keys * * Based upon Paul Butler’s Simple Diff Algorithm v0.1 © 2007 (zlib/libpng) https://paulbutler.org * * @param array $oldArray * @param array $newArray * @return array * @since 3.0.144 * */ protected function diffArray(array $oldArray, array $newArray) { $matrix = array(); $maxLen = 0; $oldMax = 0; $newMax = 0; foreach($oldArray as $oldKey => $oldValue){ $newKeys = array_keys($newArray, $oldValue); foreach($newKeys as $newKey) { $len = 1; if(isset($matrix[$oldKey - 1][$newKey - 1])) { $len = $matrix[$oldKey - 1][$newKey - 1] + 1; } $matrix[$oldKey][$newKey] = $len; if($len > $maxLen) { $maxLen = $len; $oldMax = $oldKey + 1 - $maxLen; $newMax = $newKey + 1 - $maxLen; } } } if($maxLen == 0) { $result = array( array('del' => $oldArray, 'ins' => $newArray) ); } else { $result = array_merge( $this->diffArray( array_slice($oldArray, 0, $oldMax), array_slice($newArray, 0, $newMax) ), array_slice($newArray, $newMax, $maxLen), $this->diffArray( array_slice($oldArray, $oldMax + $maxLen), array_slice($newArray, $newMax + $maxLen) ) ); } return $result; } /** * Given two strings ($old and $new) return a diff string in HTML markup * * @param string $old Old string value * @param string $new New string value * @param array $options Options to modify behavior: * - `ins` (string) Markup to use for diff insertions (default: `{out}`) * - `del` (string) Markup to use for diff deletions (default: `{out}`) * - `entityEncode` (bool): Entity encode values, other than added ins/del tags? (default=true) * - `split` (string): Regex used to split strings for parts to diff (default=`\s+`) * @return string * @since 3.0.144 * */ public function diffMarkup($old, $new, array $options = array()) { $defaults = array( 'ins' => "{out}", 'del' => "{out}", 'entityEncode' => true, 'split' => '\s+', ); /** @var Sanitizer $sanitizer */ $sanitizer = $this->wire('sanitizer'); list($old, $new) = array("$old", "$new"); // enforce as string $options = array_merge($defaults, $options); $oldArray = preg_split("!($options[split])!", $old, 0, PREG_SPLIT_DELIM_CAPTURE); $newArray = preg_split("!($options[split])!", $new, 0, PREG_SPLIT_DELIM_CAPTURE); $diffArray = $this->diffArray($oldArray, $newArray); list(,$delClose) = explode('{out}', $options['del'], 2); list($insOpen,) = explode('{out}', $options['ins'], 2); $out = ''; foreach($diffArray as $diff) { if(is_array($diff)) { foreach(array('del', 'ins') as $key) { if(empty($diff[$key])) continue; $diffStr = implode('', $diff[$key]); if($options['entityEncode']) $diffStr = $sanitizer->entities1($diffStr); $out .= str_replace('{out}', $diffStr, $options[$key]); } } else { $out .= ($options['entityEncode'] ? $sanitizer->entities1($diff) : $diff); } } if(strpos($out, "$delClose$insOpen")) { // put a space between '' so that it is ' ' $out = str_replace("$delClose$insOpen", "$delClose $insOpen", $out); } return $out; } /** * Find escaped characters in $str, replace them with a placeholder, and return the placeholders * * Usage * ~~~~~ * // 1. Escape certain chars in a string that you want to survive some processing: * $str = 'Hello \*world\* foo \"bar\" baz'; * * // 2. Use this method to find escape chars and replace them temporarily: * $a = $sanitizer->getTextTools()->findReplaceEscapeChars($str, [ '*', '"' ]); * * // 3. Process string with anything that you want NOT to see chars that were escaped: * $str = some_function_that_processes_the_string($str); * * // 4. Do this to restore the escaped chars (restored without backslashes by default): * $str = str_replace(array_keys($a), array_values($a), $str); * ~~~~~ * * @param string &$str String to find escape chars in, it will be modified directly (passed by reference) * @param array $escapeChars Array of chars you want to escape i.e. [ '*', '[', ']', '(', ')', '`', '_', '\\', '"' ] * @param array $options Options to modify behavior: * - `escapePrefix` (string): Character used to escape another character (default is backslash). * - `restoreEscape` (bool): Should returned array also include the escape prefix, so escapes are restored? (default=false) * - `gluePrefix` (string): Prefix for placeholders we substitute for escaped characters (default='{ESC') * - `glueSuffix` (string): Suffix for placeholders we substitute for escaped characters (default='}') * - `unescapeUnknown` (bool): If we come across escaped char not in your $escapeChars list, unescape it? (default=false) * - `removeUnknown` (bool): If we come across escaped char not in your $escapeChars list, remove the escape and char? (default=false) * @return array Returns assoc array where keys are placeholders substituted in $str and values are escaped characters. * @since 3.0.162 * */ public function findReplaceEscapeChars(&$str, array $escapeChars, array $options = array()) { $defaults = array( 'escapePrefix' => '\\', 'restoreEscape' => false, // when restoring, also restore escape prefix? 'gluePrefix' => '{ESC', 'glueSuffix' => '}', 'unescapeUnknown' => false, 'removeUnknown' => false, ); $options = array_merge($defaults, $options); $escapePrefix = $options['escapePrefix']; if(strpos($str, $escapePrefix) === false) return array(); $escapes = array(); $glueSuffix = $options['glueSuffix']; $parts = explode($escapePrefix, $str); $n = 0; do { $gluePrefix = $options['gluePrefix'] . $n; } while($this->strpos($str, $gluePrefix) !== false && ++$n); $str = array_shift($parts); foreach($parts as $key => $part) { $len = $this->strlen($part); $char = $len > 0 ? $this->substr($part, 0, 1) : ''; // char being escaped $part = $len > 1 ? $this->substr($part, 1) : ''; // everything after it $charKey = array_search($char, $escapeChars); // find placeholder (glue) if($charKey !== false) { // replace escaped char with placeholder ($glue) $glue = $gluePrefix . $charKey . $glueSuffix; $escapes[$glue] = $options['restoreEscape'] ? $escapePrefix . $char : $char; $str .= $glue . $part; } else if($options['unescapeUnknown']) { // unescape unknown escape char $str .= $char . $part; } else if($options['removeUnknown']) { // remove unknown escape char $str .= $part; } else { // some other backslash that’s allowed, restore back as it was $str .= $escapePrefix . $char . $part; } } return $escapes; } /*********************************************************************************************************** * MULTIBYTE PHP STRING FUNCTIONS THAT FALLBACK WHEN MBSTRING NOT AVAILABLE * * These duplicate the equivalent PHP string methods and use exactly the same arguments * and exhibit exactly the same behavior. The only difference is that these methods using * the multibyte string versions when they are available, and fallback to the regular PHP * string methods when not. Use these functions only when that behavior is okay. * */ /** * Get part of a string * * #pw-group-PHP-function-alternates * * @param string $str * @param int $start * @param int|null $length Max chars to use from str. If omitted or NULL, extract all characters to the end of the string. * @return string * @see https://www.php.net/manual/en/function.substr.php * */ public function substr($str, $start, $length = null) { return $this->mb ? mb_substr($str, $start, $length) : substr($str, $start, $length); } /** * Find position of first occurrence of string in a string * * #pw-group-PHP-function-alternates * * @param string $haystack * @param string $needle * @param int $offset * @return bool|false|int * @see https://www.php.net/manual/en/function.strpos.php * */ public function strpos($haystack, $needle, $offset = 0) { return $this->mb ? mb_strpos($haystack, $needle, $offset) : strpos($haystack, $needle, $offset); } /** * Find the position of the first occurrence of a case-insensitive substring in a string * * #pw-group-PHP-function-alternates * * @param string $haystack * @param string $needle * @param int $offset * @return bool|false|int * @see https://www.php.net/manual/en/function.stripos.php * */ public function stripos($haystack, $needle, $offset = 0) { return $this->mb ? mb_stripos($haystack, $needle, $offset) : stripos($haystack, $needle, $offset); } /** * Find the position of the last occurrence of a substring in a string * * #pw-group-PHP-function-alternates * * @param string $haystack * @param string $needle * @param int $offset * @return bool|false|int * @see https://www.php.net/manual/en/function.strrpos.php * */ public function strrpos($haystack, $needle, $offset = 0) { return $this->mb ? mb_strrpos($haystack, $needle, $offset) : strrpos($haystack, $needle, $offset); } /** * Find the position of the last occurrence of a case-insensitive substring in a string * * #pw-group-PHP-function-alternates * * @param string $haystack * @param string $needle * @param int $offset * @return bool|false|int * @see https://www.php.net/manual/en/function.strripos.php * */ public function strripos($haystack, $needle, $offset = 0) { return $this->mb ? mb_strripos($haystack, $needle, $offset) : strripos($haystack, $needle, $offset); } /** * Get string length * * #pw-group-PHP-function-alternates * * @param string $str * @return int * @see https://www.php.net/manual/en/function.strlen.php * */ public function strlen($str) { return $this->mb ? mb_strlen($str) : strlen($str); } /** * Make a string lowercase * * #pw-group-PHP-function-alternates * * @param string $str * @return string * @see https://www.php.net/manual/en/function.strtolower.php * */ public function strtolower($str) { return $this->mb ? mb_strtolower($str) : strtolower($str); } /** * Make a string uppercase * * #pw-group-PHP-function-alternates * * @param string $str * @return string * @see https://www.php.net/manual/en/function.strtoupper.php * */ public function strtoupper($str) { return $this->mb ? mb_strtoupper($str) : strtoupper($str); } /** * Count the number of substring occurrences * * #pw-group-PHP-function-alternates * * @param string $haystack * @param string $needle * @return int * @see https://www.php.net/manual/en/function.substr-count.php * */ public function substrCount($haystack, $needle) { return $this->mb ? mb_substr_count($haystack, $needle) : substr_count($haystack, $needle); } /** * Find the first occurrence of a string * * #pw-group-PHP-function-alternates * * @param string $haystack * @param string $needle * @param bool $beforeNeedle Return part of haystack before first occurrence of the needle? (default=false) * @return false|string * @see https://www.php.net/manual/en/function.strstr.php * */ public function strstr($haystack, $needle, $beforeNeedle = false) { return $this->mb ? mb_strstr($haystack, $needle, $beforeNeedle) : strstr($haystack, $needle, $beforeNeedle); } /** * Find the first occurrence of a string (case insensitive) * * #pw-group-PHP-function-alternates * * @param string $haystack * @param string $needle * @param bool $beforeNeedle Return part of haystack before first occurrence of the needle? (default=false) * @return false|string * @see https://www.php.net/manual/en/function.stristr.php * */ public function stristr($haystack, $needle, $beforeNeedle = false) { return $this->mb ? mb_stristr($haystack, $needle, $beforeNeedle) : stristr($haystack, $needle, $beforeNeedle); } /** * Find the last occurrence of a character in a string * * #pw-group-PHP-function-alternates * * @param string $haystack * @param string $needle Only first given character used * @return false|string * @see https://www.php.net/manual/en/function.strrchr.php * */ public function strrchr($haystack, $needle) { return $this->mb ? mb_strrchr($haystack, $needle) : strrchr($haystack, $needle); } /** * Strip whitespace (or other characters) from the beginning and end of a string * * #pw-group-PHP-function-alternates * * @param string $str * @param string $chars Omit for default * @return string * */ public function trim($str, $chars = '') { if(!$this->mb) return $chars === '' ? trim($str) : trim($str, $chars); return $this->wire()->sanitizer->trim($str, $chars); } /** * Strip whitespace (or other characters) from the beginning of string only (aka left trim) * * #pw-group-PHP-function-alternates * * @param string $str * @param string $chars Omit for default * @return string * @since 3.0.168 * */ public function ltrim($str, $chars = '') { if(!$this->mb) return $chars === '' ? ltrim($str) : ltrim($str, $chars); return $this->wire()->sanitizer->trim($str, $chars, 'ltrim'); } /** * Strip whitespace (or other characters) from the end of string only (aka right trim) * * #pw-group-PHP-function-alternates * * @param string $str * @param string $chars Omit for default * @return string * @since 3.0.168 * */ public function rtrim($str, $chars = '') { if(!$this->mb) return $chars === '' ? rtrim($str) : rtrim($str, $chars); return $this->wire()->sanitizer->trim($str, $chars, 'rtrim'); } }