text($dirtyValue); * ~~~~~~ * You can replace the `text()` call above with any other sanitizer method. Many sanitizer methods also accept additional * arguments—see each individual method for details. * * ### Sanitizer and input * * Sanitizer methods are most commonly used with user input. As a result, the methods in this class are also accessible * from the `$input->get`, `$input->post` and `$input->cookie` API variables, in the same manner that they are here. * This is a useful shortcut for instances where you don’t need to provide additional arguments to the sanitizer method. * Below are a few examples of this usage: * ~~~~~ * // get GET variable 'id' as integer * $id = $input->get->int('id'); * * // get POST variable 'name' as 1-line plain text * $name = $input->post->text('name'); * * // get POST variable 'comments' as multi-line plain text * $comments = $input->post->textarea('comments'); * ~~~~~ * In ProcessWire 3.0.125 and newer you can also perform the same task as the above with one less `->` level like the * example below: * ~~~~~ * $comments = $input->post('comments','textarea'); * ~~~~~ * This is more convenient in some IDEs because it’ll never be flagged as an unrecognized function call. Though outside * of that it makes little difference how you call it, as they both do the same thing. * * See the `$input` API variable for more details on how to call sanitizers directly from $input. * * ### Adding your own sanitizers * * You can easily add your own new sanitizers via ProcessWire hooks. Hooks are commonly added in a /site/ready.php file, * or from a Module, though you may add them wherever you want. 
The following example adds a sanitizer method called * `zip()` which enforces a 5 digit zip code: * ~~~~~ * $sanitizer->addHook('zip', function(HookEvent $event) { * $sanitizer = $event->object; * $value = $event->arguments(0); // get first argument given to method * $value = $sanitizer->digits($value, 5); // allow only digits, max-length 5 * if(strlen($value) < 5) $value = ''; // if fewer than 5 digits, it is not a zip * $event->return = $value; * }); * * // now you can use your zip sanitizer * $dirtyValue = 'Decatur GA 30030'; * $cleanValue = $sanitizer->zip($dirtyValue); * echo $cleanValue; // outputs: 30030 * ~~~~~ * * ### Additional options (3.0.125 or newer) * * In ProcessWire 3.0.125+ you can also combine sanitizer methods in a single call. These are defined by separating each * sanitizer method with an underscore. The example below runs the value through the text sanitizer and then through the * entities sanitizer: * ~~~~~ * $cleanValue = $sanitizer->text_entities($dirtyValue); * ~~~~~ * If you append a number to any sanitizer call that returns a string, it is assumed to be maximum allowed length. For * example, the following would sanitize the value to be text of no more than 20 characters: * ~~~~~ * $cleanValue = $sanitizer->text20($dirtyValue); * ~~~~~ * The above technique also works for any user-defined sanitizers you’ve added via hooks. We like this strategy for * storage of sanitizer calls that are executed at some later point, like those you might store in a module config. It * essentially enables you to define loose data types for sanitization. In addition, if there are other cases where you * need multiple sanitizers to clean a particular value, this strategy can do it with a lot less code than you would * with multiple sanitizer calls. * * Most methods in the Sanitizer class focus on sanitization rather than validation, with a few exceptions. 
You can
 * convert a sanitizer call to a validation call by calling the `validate()` method with the name of the sanitizer and the
 * value. A validation call simply implies that if the value is modified by sanitization then it is considered invalid
 * and thus it’ll return a non-value rather than a sanitized value. See the `Sanitizer::validate()` and
 * `Sanitizer::valid()` methods for usage details.
 *
 * #pw-body
 *
 * ProcessWire 3.x, Copyright 2022 by Ryan Cramer
 * https://processwire.com
 *
 * @link https://processwire.com/api/variables/sanitizer/ Official $sanitizer API variable Documentation
 *
 * @method array($value, $sanitizer = null, array $options = array())
 * @method array testAll($value)
 *
 */
class Sanitizer extends Wire {

	/**
	 * Constant used for the $beautify argument of name sanitizer methods to indicate transliteration may be used.
	 *
	 */
	const translate = 2;

	/**
	 * Beautify argument for pageName() to IDN encode UTF8 to ascii
	 * #pw-internal
	 *
	 */
	const toAscii = 4;

	/**
	 * Beautify argument for pageName() to allow decode IDN ascii to UTF8
	 * #pw-internal
	 *
	 */
	const toUTF8 = 8;

	/**
	 * Beautify argument for pageName() to indicate that UTF8 (in whitelist) is allowed
	 *
	 * Unlike the toUTF8 option, no ascii to UTF8 conversion is allowed.
	 * #pw-internal
	 *
	 */
	const okUTF8 = 16;

	/**
	 * Caches the status of multibyte support.
	 *
	 * @var bool
	 *
	 */
	protected $multibyteSupport = false;

	/**
	 * Array of allowed ascii characters for name filters
	 *
	 * @var array
	 *
	 */
	protected $allowedASCII = array();

	/**
	 * ASCII alpha chars
	 *
	 * @var string
	 *
	 */
	protected $alphaASCII = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';

	/**
	 * ASCII digits chars
	 *
	 * @var string
	 *
	 */
	protected $digitASCII = '0123456789';

	/**
	 * @var null|WireTextTools
	 *
	 */
	protected $textTools = null;

	/**
	 * @var null|WireNumberTools
	 *
	 */
	protected $numberTools = null;

	/**
	 * Runtime caches
	 *
	 * @var array
	 *
	 */
	protected $caches = array();

	/**
	 * UTF-8 whitespace hex codes
	 *
	 * @var array
	 *
	 */
	protected $whitespaceUTF8 = array(
		'0000', // null byte
		'0009', // character tab
		'000A', // line feed
		'000B', // line tab (vertical tab)
		'000C', // form feed
		'000D', // carriage return
		'0020', // space
		'0085', // next line
		'00A0', // non-breaking space
		'1680', // ogham space mark
		'180E', // mongolian vowel separator
		'2000', // en quad
		'2001', // em quad
		'2002', // en space
		'2003', // em space
		'2004', // three per em space
		'2005', // four per em space
		'2006', // six per em space
		'2007', // figure space
		'2008', // punctuation space
		'2009', // thin space
		'200A', // hair space
		'200B', // zero width space
		'200C', // zero width non-join
		'200D', // zero width join
		'2028', // line separator
		'2029', // paragraph separator
		'202F', // narrow non-breaking space
		'205F', // medium mathematical space
		'2060', // word join
		'3000', // ideographic space
		'FEFF', // zero width non-breaking space
	);

	/**
	 * HTML entities representing whitespace
	 *
	 * Note that this array is populated with all decimal/hex entities after a call to
	 * getWhitespaceArray() method with the $html option as true.
* * @var array * */ protected $whitespaceHTML = array( ' ', // non-breaking space ' ', // en space ' ', // em space ' ', // thin space '‌', // zero width non-join '‍', // zero width join ); /** * Sanitizer method names (A-Z) and type(s) they return * * a: array * b: boolean * f: float * i: integer * m: mixed * n: null * s: string * * @var array * */ protected $sanitizers = array( 'alpha' => 's', 'alphanumeric' => 's', 'array' => 'a', 'arrayVal' => 'a', 'attrName' => 's', 'bit' => 'i', 'bool' => 'b', 'camelCase' => 's', 'chars' => 's', 'checkbox' => 'b', 'date' => 'ins', 'digits' => 's', 'email' => 's', 'emailHeader' => 's', 'entities' => 's', 'entities1' => 's', 'entitiesA' => 'asifb', 'entitiesA1' => 'asifb', 'entitiesMarkdown' => 's', 'fieldName' => 's', 'fieldSubfield' => 's', 'filename' => 's', 'flatArray' => 'a', 'float' => 'f', 'htmlClass' => 's', 'htmlClasses' => 's', 'httpUrl' => 's', 'hyphenCase' => 's', 'int' => 'i', 'intArray' => 'a', 'intArrayVal' => 'a', 'intSigned' => 'i', 'intUnsigned' => 'i', 'kebabCase' => 's', 'line' => 's', 'lines' => 's', 'markupToLine' => 's', 'markupToText' => 's', 'max' => 'fi', 'maxBytes' => 's', 'maxLength' => 'afis', 'minLength' => 's', 'min' => 'fi', 'minArray' => 'a', 'name' => 's', 'names' => 'as', 'normalizeWhitespace' => 's', 'pageName' => 's', 'pageNameTranslate' => 's', 'pageNameUTF8' => 's', 'pagePathName' => 's', 'pagePathNameUTF8' => 's', 'pascalCase' => 's', 'path' => 'bs', 'purify' => 's', 'range' => 'fi', 'reduceWhitespace' => 's', 'removeMB4' => 'ams', 'removeNewlines' => 's', 'removeWhitespace' => 's', 'sanitize' => 'm', 'selectorField' => 's', 'selectorValue' => 's', 'selectorValueAdvanced' => 's', 'snakeCase' => 's', 'string' => 's', 'templateName' => 's', 'text' => 's', 'textarea' => 's', 'textdomain' => 's', 'trim' => 's', 'truncate' => 's', 'unentities' => 's', 'url' => 's', 'valid' => 'b', 'validate' => 'm', 'varName' => 's', 'word' => 's', 'words' => 's', 'wordsArray' => 'a', ); /** * Construct the 
sanitizer
	 *
	 */
	public function __construct() {
		parent::__construct();
		// detect mbstring once; when present, force UTF-8 as the internal encoding
		$this->multibyteSupport = function_exists("mb_internal_encoding");
		if($this->multibyteSupport) mb_internal_encoding("UTF-8");
		// one array element per allowed ASCII letter/digit, for use with str_replace()
		$this->allowedASCII = str_split($this->alphaASCII . $this->digitASCII);
	}

	/*************************************************************************************************************
	 * STRING SANITIZERS
	 *
	 */

	/**
	 * Internal filter used by other name filtering methods in this class
	 *
	 * Characters other than ASCII letters, digits and the given $allowedExtras are replaced
	 * with $replacementChar. The value is truncated to $maxLength bytes.
	 *
	 * #pw-internal
	 *
	 * @param string $value Value to filter
	 * @param array $allowedExtras Additional characters that are allowed in the value
	 * @param string $replacementChar 1 character replacement value for invalid characters
	 * @param bool|int $beautify Whether to beautify the string, specify `Sanitizer::translate` to perform transliteration.
	 * @param int $maxLength Maximum length of returned value in bytes (default=128)
	 * @return string
	 *
	 */
	public function nameFilter($value, array $allowedExtras, $replacementChar, $beautify = false, $maxLength = 128) {

		// transliteration table, loaded once per request and shared by all calls
		static $replacements = array();

		if(!is_string($value)) $value = $this->string($value);
		$allowed = array_merge($this->allowedASCII, $allowedExtras);
		// non-zero when the value contains anything outside of the allowed characters
		$needsWork = strlen(str_replace($allowed, '', $value));
		$extras = implode('', $allowedExtras);

		if($beautify && $needsWork) {
			if($beautify === self::translate && $this->multibyteSupport) {
				$value = mb_strtolower($value);
				if(empty($replacements)) {
					$modules = $this->wire()->modules;
					if($modules) {
						$configData = $this->wire()->modules->getModuleConfigData('InputfieldPageName');
						$replacements = empty($configData['replacements']) ? InputfieldPageName::$defaultReplacements : $configData['replacements'];
					}
				}
				foreach($replacements as $from => $to) {
					if(mb_strpos($value, $from) !== false) {
						$value = mb_eregi_replace($from, $to, $value);
					}
				}
			}
			if(function_exists("\\iconv")) {
				$v = iconv("UTF-8", "ASCII//TRANSLIT//IGNORE", $value);
				if($v) $value = $v;
			}
			// re-check, since transliteration may have resolved everything
			$needsWork = strlen(str_replace($allowed, '', $value));
		}

		if(strlen($value) > $maxLength) $value = substr($value, 0, $maxLength);

		if($needsWork) {
			$value = str_replace(array("'", '"'), '', $value); // blank out any quotes
			$_value = $value;
			$filters = FILTER_FLAG_STRIP_LOW | FILTER_FLAG_STRIP_HIGH | FILTER_FLAG_STRIP_BACKTICK;
			$value = filter_var($value, FILTER_UNSAFE_RAW, $filters);
			if(!strlen($value)) {
				// if above filter blanked out the string, try with brackets already replaced
				$value = str_replace(array('<', '>', '«', '»', '‹', '›'), $replacementChar, $_value);
				$value = filter_var($value, FILTER_UNSAFE_RAW, $filters);
			}
			$hyphenPos = strpos($extras, '-');
			if($hyphenPos !== false && $hyphenPos !== 0) {
				// if hyphen present, ensure it's first (per PCRE requirements)
				$extras = '-' . str_replace('-', '', $extras);
			}
			// escape the extras so that characters like "]", "^" or "\" are matched literally
			// rather than being interpreted as character class syntax
			$chars = preg_quote($extras) . 'a-zA-Z0-9';
			$value = preg_replace('{[^' . $chars . ']}', $replacementChar, $value);
		}

		// remove leading or trailing dashes, underscores, dots
		if($beautify) {
			if($replacementChar !== null && strlen($replacementChar)) {
				if(strpos($extras, $replacementChar) === false) $extras .= $replacementChar;
			}
			$value = trim($value, $extras);
		}

		return $value;
	}

	/**
	 * Sanitize in "name" format (ASCII alphanumeric letters/digits, hyphens, underscores, periods)
	 *
	 * Default behavior:
	 *
	 * - Allows both upper and lowercase ASCII letters.
	 * - Limits maximum length to 128 characters.
	 * - Replaces non-name format characters with underscore "_".
	 *
	 * ~~~~~
	 * $test = "Foo+Bar Baz-123";
	 * echo $sanitizer->name($test); // outputs: Foo_Bar_Baz-123
	 * ~~~~~
	 *
	 * #pw-group-strings
	 *
	 * @param string $value Value that you want to convert to name format.
* @param bool|int $beautify Beautify the returned name?
	 *  - Beautify makes returned name prettier by getting rid of doubled punctuation, leading/trailing punctuation and such.
	 *  - Should be TRUE when creating a resource using the name for the first time (default is FALSE).
	 *  - You may also specify the constant `Sanitizer::translate` (or integer 2) for this argument, which will make it
	 *    translate letters based on name format settings in ProcessWire.
	 * @param int $maxLength Maximum number of characters allowed in the name (default=128).
	 * @param string $replacement Replacement character for invalid characters. Should be either "_", "-" or "." (default="_").
	 * @param array $options Extra options to replace default 'beautify' behaviors
	 *  - `allowAdjacentExtras` (bool): Whether to allow [-_.] characters next to each other (default=false).
	 *  - `allowDoubledReplacement` (bool): Whether to allow two of the same replacement chars [-_] next to each other (default=false).
	 *  - `allowedExtras` (array): Specify extra allowed characters (default=`['-', '_', '.']`).
	 * @return string Sanitized value in name format
	 * @see Sanitizer::pageName()
	 *
	 */
	public function name($value, $beautify = false, $maxLength = 128, $replacement = '_', $options = array()) {

		if(!empty($options['allowedExtras']) && is_array($options['allowedExtras'])) {
			$allowedExtras = $options['allowedExtras'];
			$allowedExtrasStr = implode('', $allowedExtras);
		} else {
			$allowedExtras = array('-', '_', '.');
			$allowedExtrasStr = '-_.';
		}

		$value = $this->nameFilter($value, $allowedExtras, $replacement, $beautify, $maxLength);

		if($beautify) {

			$hasExtras = false;
			foreach($allowedExtras as $c) {
				$hasExtras = strpos($value, $c) !== false;
				if($hasExtras) break;
			}

			if($hasExtras) {
				if(empty($options['allowAdjacentExtras']) && strlen($allowedExtrasStr)) {
					// replace any of '-_.' next to each other with a single $replacement
					// extras are escaped so that an order like "_-." is not read as an (invalid) range
					$value = preg_replace('![' . preg_quote($allowedExtrasStr, '!') . ']{2,}!', $replacement, $value);
				}
				if(empty($options['allowDoubledReplacement'])) {
					// replace double'd replacements
					$r = "$replacement$replacement";
					// an empty replacement has nothing to collapse (and would never terminate in PHP 8+)
					if($r !== '') {
						while(strpos($value, $r) !== false) $value = str_replace($r, $replacement, $value);
					}
				}
				// replace double dots
				while(strpos($value, '..') !== false) $value = str_replace('..', '.', $value);
			}

			if(strlen($value) > $maxLength) $value = substr($value, 0, $maxLength);
		}

		return $value;
	}

	/**
	 * Sanitize a string or array containing multiple names
	 *
	 * - Default behavior is to sanitize to ASCII alphanumeric and hyphen, underscore, and period.
	 * - If given a string, multiple names may be separated by a delimiter (which is a space by default).
	 * - Return value will be of the same type as the given value (i.e. string or array).
	 *
	 * #pw-group-strings
	 *
	 * @param string|array $value Value(s) to sanitize to name format.
	 * @param string $delimeter Character that delimits values, if $value is a string (default=" ").
	 * @param array $allowedExtras Additional characters that are allowed in the value (default=['-', '_', '.']).
	 * @param string $replacementChar Single character replacement value for invalid characters (default='_').
	 * @param bool $beautify Whether or not to beautify returned values (default=false). See Sanitizer::name() for beautify options.
	 * @return string|array Returns string if given a string for $value, returns array if given an array for $value.
*
	 */
	public function names($value, $delimeter = ' ', $allowedExtras = array('-', '_', '.'), $replacementChar = '_', $beautify = false) {
		// an empty delimiter cannot separate anything: it would make the collapse loop
		// below endless and explode() throw in PHP 8+, so fall back to the default space
		if($delimeter === '' || $delimeter === null) $delimeter = ' ';
		$isArray = false;
		if(is_array($value)) {
			$isArray = true;
			$value = implode(' ', $value);
		}
		// normalize all recognized separators to a space before filtering
		$replace = array(',', '|', ' ');
		if($delimeter != ' ' && !in_array($delimeter, $replace)) $replace[] = $delimeter;
		$value = str_replace($replace, ' ', "$value");
		$allowedExtras[] = ' ';
		$value = $this->nameFilter($value, $allowedExtras, $replacementChar, $beautify, 8192);
		if($delimeter != ' ') $value = str_replace(' ', $delimeter, $value);
		// collapse repeated delimiters
		while(strpos($value, "$delimeter$delimeter") !== false) {
			$value = str_replace("$delimeter$delimeter", $delimeter, $value);
		}
		$value = trim($value, $delimeter);
		if($isArray) {
			// nothing left after sanitization means no names, rather than one blank name
			$value = strlen($value) ? explode($delimeter, $value) : array();
		}
		return $value;
	}

	/**
	 * Sanitizes a string to be consistent with PHP variable names (not including '$').
	 *
	 * Allows upper and lowercase ASCII letters, digits and underscore.
	 *
	 * #pw-internal
	 *
	 * @param string $value String you want to sanitize
	 * @return string Sanitized string
	 *
	 */
	public function varName($value) {
		$value = $this->nameFilter($value, array('_'), '_');
		if(!ctype_alpha($value)) $value = ltrim($value, $this->digitASCII); // vars cannot begin with numbers
		return $value;
	}

	/**
	 * Sanitize to an ASCII-only HTML attribute name
	 *
	 * #pw-group-strings
	 *
	 * @param string $value
	 * @param int $maxLength Maximum length of returned attribute name (default=255)
	 * @return string Sanitized attribute name, or blank string if it could not be made valid
	 * @since 3.0.133
	 *
	 */
	public function attrName($value, $maxLength = 255) {

		$value = $this->string($value);
		$value = trim($value); // force as trimmed string

		if(ctype_alpha($value) && strlen($value) <= $maxLength) return $value; // simple 1-word attributes

		// remove any non ":_a-zA-Z" characters from beginning of attribute name
		while(strlen($value) && strpos(":_$this->alphaASCII", substr($value, 0, 1)) === false) {
			$value = substr($value, 1);
		}

		if(ctype_alnum(str_replace(array('-', '_', ':', '.'), '', $value))) {
			// names with HTML valid separators
			if(strlen($value) <= $maxLength) return $value;
		}

		// at this point attribute name contains something unusual
		if(!ctype_graph($value)) {
			// contains non-visible characters
			$value = preg_replace('/[\s\r\n\t]+/', '-', $value);
			if(!ctype_graph($value)) $value = ''; // fail
		}

		if($value !== '') {
			// replace non-word, non-digit, non-punct characters
			$value = preg_replace('/[^-_.:\w\d]+/', '-', $value);
			$value = htmlspecialchars($value, ENT_QUOTES, 'UTF-8');
		}

		if($value === 'data-') $value = ''; // data attribute with no name is disallowed

		if(strlen($value) > $maxLength) {
			$value = substr($value, 0, $maxLength);
		}

		return $value;
	}

	/**
	 * Sanitize string to ASCII-only HTML class attribute value
	 *
	 * Note that this does not support all possible characters in an HTML class attribute
	 * and instead focuses on the most commonly used ones. Characters allowed in HTML class
	 * attributes from this method include: `-_:@a-zA-Z0-9`. This method does not allow
	 * values that have no letters or digits.
	 *
	 * @param string $value
	 * @return string Sanitized class name or blank string if not valid
	 * @since 3.0.212
	 *
	 */
	public function htmlClass($value) {
		$value = trim("$value");
		if(empty($value)) return '';
		$extras = array('-', '_', ':', '@');
		$value = $this->nameFilter($value, $extras, '-');
		$value = ltrim($value, '0123456789'); // cannot begin with digit
		if(trim($value, implode('', $extras)) === '') $value = ''; // do not allow extras-only class
		return $value;
	}

	/**
	 * Sanitize string to ASCII-only space-separated HTML class attribute values with no duplicates
	 *
	 * See additional notes in `Sanitizer::htmlClass()` method.
	 *
	 * @param string|array $value
	 * @param bool $getArray Get array rather than string? (default=false)
	 * @return string|array
	 * @since 3.0.212
	 *
	 */
	public function htmlClasses($value, $getArray = false) {
		if(is_array($value)) $value = implode(' ', $value);
		// newlines, tabs, commas and periods are all treated as class separators
		$value = str_replace(array("\n", "\r", "\t", ",", "."), ' ', $value);
		$value = trim("$value");
		if(empty($value)) return $getArray ? array() : '';
		$a = array();
		foreach(explode(' ', $value) as $c) {
			$c = $this->htmlClass($c);
			if(!empty($c)) $a[$c] = $c; // keyed by class name to drop duplicates
		}
		if($getArray) return array_values($a);
		return count($a) ? implode(' ', $a) : '';
	}

	/**
	 * Sanitize consistent with names used by ProcessWire fields and/or PHP variables
	 *
	 * - Allows upper and lowercase ASCII letters, digits and underscore.
	 * - ProcessWire field names follow the same conventions as PHP variable names, though digits may lead.
	 * - This method is the same as the varName() sanitizer except that it supports beautification and max length.
	 * - Unlike other name formats, hyphen and period are excluded because they aren't allowed characters in PHP variables.
	 *
	 * ~~~~~
	 * $test = "Hello world";
	 * echo $sanitizer->fieldName($test); // outputs: Hello_world
	 * ~~~~~
	 *
	 * #pw-group-strings
	 *
	 * @param string $value Value you want to sanitize
	 * @param bool|int $beautify Should be true when using the name for a new field (default=false).
	 *  You may also specify constant `Sanitizer::translate` (or number 2) for the $beautify param, which will make it translate letters
	 *  based on the system page name translation settings.
	 * @param int $maxLength Maximum number of characters allowed in the name (default=128).
	 * @return string Sanitized string
	 *
	 */
	public function fieldName($value, $beautify = false, $maxLength = 128) {
		return $this->nameFilter($value, array('_'), '_', $beautify, $maxLength);
	}

	/**
	 * Sanitize as a field name but with optional subfield(s) like “field.subfield”
	 *
	 * - Periods must be present to indicate subfield(s), otherwise behaves same as fieldName() sanitizer.
	 * - By default allows just one subfield. To allow more, increase the $limit argument.
	 * - To allow any quantity of subfields, specify -1.
	 * - To reduce a `field.subfield...` combo to just `field` specify 0 for limit argument.
	 * - Maximum length of returned string is (128 + ($limit * 128)).
*
	 * ~~~~~~
	 * echo $sanitizer->fieldSubfield('a.b.c'); // outputs: a.b (default behavior)
	 * echo $sanitizer->fieldSubfield('a.b.c', 2); // outputs: a.b.c
	 * echo $sanitizer->fieldSubfield('a.b.c', 0); // outputs: a
	 * echo $sanitizer->fieldSubfield('a.b.c', -1); // outputs: a.b.c (any quantity)
	 * echo $sanitizer->fieldSubfield('foo bar.baz'); // outputs: foo_bar.baz
	 * echo $sanitizer->fieldSubfield('foo bar baz'); // outputs: foo_bar_baz
	 * ~~~~~~
	 *
	 * #pw-group-strings
	 *
	 * @param string $value Value to sanitize
	 * @param int $limit Max allowed quantity of subfields, or use -1 for any quantity (default=1).
	 * @return string
	 * @since 3.0.126
	 *
	 */
	public function fieldSubfield($value, $limit = 1) {

		$value = $this->string($value);
		if(!strlen($value)) return '';

		// without a period after the first character, the value is one plain field name
		$dotPos = strpos($value, '.');
		if($dotPos === false || $dotPos === 0) return $this->fieldName($value);

		$names = array();
		$segments = explode('.', trim($value, '.'));

		foreach($segments as $segment) {
			$name = $this->fieldName($segment);
			// an empty segment ends the chain
			if(!strlen($name)) break;
			$names[] = $name;
			// stop once the field plus the allowed number of subfields is collected
			if($limit > -1 && count($names) - 1 >= $limit) break;
		}

		// yields blank for no names, the lone name for one, or "a.b.c" for several
		return implode('.', $names);
	}

	/**
	 * Name filter as used by ProcessWire Templates
	 *
	 * Allows ASCII letters, digits, underscore and hyphen, replacing anything else with a hyphen.
	 *
	 * #pw-internal
	 *
	 * @param string $value
	 * @param bool|int $beautify Should be true when creating a name for the first time. Default is false.
	 *  You may also specify Sanitizer::translate (or number 2) for the $beautify param, which will make it translate letters
	 *  based on the InputfieldPageName custom config settings.
	 * @param int $maxLength Maximum number of characters allowed in the name
	 * @return string
	 *
	 */
	public function templateName($value, $beautify = false, $maxLength = 128) {
		$allowedExtras = array('_', '-');
		$replacementChar = '-';
		return $this->nameFilter($value, $allowedExtras, $replacementChar, $beautify, $maxLength);
	}

	/**
	 * Sanitize as a ProcessWire page name
	 *
	 * - Page names by default support lowercase ASCII letters, digits, underscore, hyphen and period.
*
	 * - Because page names are often generated from a UTF-8 title, UTF-8 to ASCII conversion will take place when `$beautify` is enabled.
	 *
	 * - You may optionally omit the `$beautify` and/or `$maxLength` arguments and substitute the `$options` array instead.
	 *
	 * - When substituted, the beautify and maxLength options can be specified in $options as well.
	 *
	 * - If `$config->pageNameCharset` is "UTF8" then non-ASCII page names will be converted to punycode ("xn-") ASCII page names,
	 *   rather than converted, regardless of `$beautify` setting.
	 *
	 * ~~~~~
	 * $test = "Hello world!";
	 * echo $sanitizer->pageName($test, true); // outputs: hello-world
	 * ~~~~~
	 *
	 * #pw-group-strings
	 * #pw-group-pages
	 *
	 * @param string $value Value to sanitize as a page name
	 * @param bool|int|array $beautify This argument accepts a few different possible values (default=false):
	 *  - `true` (boolean): Make it pretty. Use this when using a pageName for the first time.
	 *  - `$options` (array): You can optionally specify the $options array for this argument instead.
	 *  - `Sanitizer::translate` (constant): This will make it translate non-ASCII letters based on *InputfieldPageName* module config settings.
	 *  - `Sanitizer::toAscii` (constant): Convert UTF-8 characters to punycode ASCII.
	 *  - `Sanitizer::toUTF8` (constant): Convert punycode ASCII to UTF-8.
	 *  - `Sanitizer::okUTF8` (constant): Allow UTF-8 characters to appear in path (implied if $config->pageNameCharset is 'UTF8').
	 * @param int|array $maxLength Maximum number of characters allowed in the name.
	 *  You may also specify the $options array for this argument instead.
	 * @param array $options Array of options to modify default behavior. See Sanitizer::name() method for available options.
	 *  A `charset` option is also recognized here; it defaults to `$config->pageNameCharset`.
	 * @return string
	 * @see Sanitizer::name()
	 *
	 */
	public function pageName($value, $beautify = false, $maxLength = 128, array $options = array()) {

		$value = $this->string($value);
		if(!strlen($value)) return '';

		$defaults = array(
			'charset' => $this->wire()->config->pageNameCharset
		);

		if(is_array($beautify)) {
			// options array was given in place of $beautify
			$options = array_merge($beautify, $options);
			$beautify = isset($options['beautify']) ? $options['beautify'] : false;
			$maxLength = isset($options['maxLength']) ? $options['maxLength'] : 128;
		} else if(is_array($maxLength)) {
			// options array was given in place of $maxLength
			$options = array_merge($maxLength, $options);
			$maxLength = isset($options['maxLength']) ? $options['maxLength'] : 128;
		}

		// apply defaults regardless of which argument carried the options, so that
		// the 'charset' option is always populated for the check below
		$options = array_merge($defaults, $options);

		if($options['charset'] !== 'UTF8' && is_int($beautify) && $beautify > self::translate) {
			// UTF8 beautify modes aren't available if $config->pageNameCharset is not UTF8
			if(in_array($beautify, array(self::toAscii, self::toUTF8, self::okUTF8))) {
				// if modes aren't supported, disable
				$beautify = false;
			}
		}

		if($beautify === self::toAscii) {
			// convert UTF8 to ascii (IDN/punycode)
			$beautify = false;
			if(strlen($value) > $maxLength) $value = substr($value, 0, $maxLength);
			$_value = $value;

			if(!ctype_alnum($value)
				&& !ctype_alnum(str_replace(array('-', '_', '.'), '', $value))
				&& strpos($value, 'xn-') !== 0) {

				do {
					// encode value
					$value = $this->punyEncodeName($_value);
					// if result stayed within our allowed character limit, then good, we're done
					if(strlen($value) <= $maxLength) break;
					// continue loop until encoded value is equal or less than allowed max length
					$_value = substr($_value, 0, strlen($_value) - 1);
				} while(true);

				// if encode was necessary and successful, return with no further processing
				if(strpos($value, 'xn-') === 0) {
					return $value;
				} else {
					// can't be encoded, send to regular name sanitizer
					$value = $_value;
				}
			}

		} else if($beautify === self::toUTF8) {
			// convert ascii IDN/punycode to UTF8
			$beautify = self::okUTF8;
			if(strpos($value, 'xn-') === 0) {
				// found something to convert
				$value = $this->punyDecodeName($value);
				// now it will run through okUTF8
			}
		}

		if($beautify === self::okUTF8) {
			// forward the requested max length rather than falling back to the 128 default
			return $this->pageNameUTF8($value, $maxLength);
		}

		return strtolower($this->name($value, $beautify, $maxLength, '-', $options));
	}

	/**
	 * Name filter for ProcessWire Page names with transliteration
	 *
	 * This is the same as calling pageName with the `Sanitizer::translate` option for the `$beautify` argument.
	 *
	 * #pw-group-strings
	 * #pw-group-pages
	 *
	 * @param string $value Value to sanitize
	 * @param int $maxLength Maximum number of characters allowed in the name
	 * @return string Sanitized value
	 *
	 */
	public function pageNameTranslate($value, $maxLength = 128) {
		return $this->pageName($value, self::translate, $maxLength);
	}

	/**
	 * Sanitize and allow for UTF-8 characters in page name
	 *
	 * - If `$config->pageNameCharset` is not `UTF8` then this function just passes control to the regular page name sanitizer.
	 * - Allowed UTF-8 characters are determined from `$config->pageNameWhitelist`.
	 * - This method does not convert to or from UTF-8, it only sanitizes it against the whitelist.
	 * - If given a value that has only ASCII characters, this will pass control to the regular page name sanitizer.
*
	 * #pw-group-strings
	 * #pw-group-pages
	 *
	 * @param string $value Value to sanitize
	 * @param int $maxLength Maximum number of characters allowed
	 * @return string Sanitized value
	 *
	 */
	public function pageNameUTF8($value, $maxLength = 128) {

		$value = $this->string($value);
		if(!strlen($value)) return '';

		$config = $this->wire()->config;

		// if UTF8 module is not enabled then delegate this call to regular pageName sanitizer
		if($config->pageNameCharset != 'UTF8') return $this->pageName($value, false, $maxLength);

		$tt = $this->getTextTools();

		// we don't allow UTF8 page names to be prefixed with "xn-"
		if(strpos($value, 'xn-') === 0) $value = substr($value, 3);

		// word separators that we always allow
		$separators = array('.', '-', '_');

		// whitelist of allowed characters and blacklist of disallowed characters
		// (an empty whitelist is represented as boolean false, meaning "whitelist disabled")
		$whitelist = $config->pageNameWhitelist;
		if(!strlen($whitelist)) $whitelist = false;
		$blacklist = '/\\%"\'<>?#@:;,+=*^$()[]{}|&';

		// we let regular pageName handle chars like these, if they appear without other UTF-8
		$extras = array('.', '-', '_', ',', ';', ':', '(', ')', '!', '?', '&', '%', '$', '#', '@');
		if($whitelist === false || strpos($whitelist, ' ') === false) $extras[] = ' ';

		// when the value is ASCII alphanumeric (apart from the extras above), the regular
		// pageName sanitizer can handle it, provided that lowercasing it is acceptable
		if(ctype_alnum(str_replace($extras, '', $value))) {
			$k = 'pageNameUTF8.whitelistIsLowercase';
			if(!isset($this->caches[$k])) {
				$this->caches[$k] = $whitelist !== false && $tt->strtolower($whitelist) === $whitelist;
			}
			if($this->caches[$k] || $tt->strtolower($value) === $value) {
				// whitelist supports only lowercase OR value is all lowercase
				// let regular pageName sanitizer handle this
				return $this->pageName($value, false, $maxLength);
			}
		}

		// validate that all characters are in our whitelist
		$replacements = array();

		for($n = 0; $n < $tt->strlen($value); $n++) {
			$c = $tt->substr($value, $n, 1);
			// the blacklist is checked with both the text tools strpos and the native byte-wise strpos
			$inBlacklist = $tt->strpos($blacklist, $c) !== false || strpos($blacklist, $c) !== false;
			$inWhitelist = !$inBlacklist && $whitelist !== false && $tt->strpos($whitelist, $c) !== false;
			if($inWhitelist && !$inBlacklist) {
				// in whitelist
			} else if($inBlacklist || !strlen(trim($c)) || ctype_cntrl($c)) {
				// character does not resolve to something visible or is in blacklist
				$replacements[] = $c;
			} else if($whitelist === false) {
				// whitelist disabled: allow everything that is not blacklisted
			} else {
				// character that is not in whitelist, double check case variants
				$cLower = $tt->strtolower($c);
				$cUpper = $tt->strtoupper($c);
				if($cLower !== $c && $tt->strpos($whitelist, $cLower) !== false) {
					// allow character and convert to lowercase variant
					$value = $tt->substr($value, 0, $n) . $cLower . $tt->substr($value, $n+1);
				} else if($cUpper !== $c && $tt->strpos($whitelist, $cUpper) !== false) {
					// allow character and convert to uppercase variant
					$value = $tt->substr($value, 0, $n) . $cUpper . $tt->substr($value, $n+1);
				} else {
					// queue character to be replaced
					$replacements[] = $c;
				}
			}
		}

		// replace disallowed characters with "-"
		if(count($replacements)) $value = str_replace($replacements, '-', $value);

		// replace doubled word separators
		foreach($separators as $c) {
			while(strpos($value, "$c$c") !== false) {
				$value = str_replace("$c$c", $c, $value);
			}
		}

		// trim off any remaining separators/extras
		$value = trim($value, '-_.');

		if($tt->strlen($value) > $maxLength) $value = $tt->substr($value, 0, $maxLength);

		return $value;
	}

	/**
	 * Decode a PW-punycode'd name value
	 *
	 * Values that do not begin with "xn-" (or are shorter than 4 bytes) are returned unchanged,
	 * as are values that fail to decode.
	 *
	 * @param string $value
	 * @return string
	 *
	 */
	protected function punyDecodeName($value) {
		// exclude values that we know can't be converted
		if(strlen($value) < 4 || strpos($value, 'xn-') !== 0) return $value;

		if(strpos($value, '__')) {
			// value consists of multiple "__" separated parts: decode each part
			// individually and join the results back together without a separator
			$_value = $value;
			$parts = explode('__', $_value);
			foreach($parts as $n => $part) {
				$parts[$n] = $this->punyDecodeName($part);
			}
			$value = implode('', $parts);
			return $value;
		}

		// keep the original so that it can be restored if decoding fails
		$_value = $value;

		// convert "xn-" single hyphen to recognized punycode "xn--" double hyphen
		if(strpos($value, 'xn--') !== 0) $value = 'xn--' . substr($value, 3);
		if(function_exists('idn_to_utf8')) {
			// use native php function if available
			$value = @idn_to_utf8($value);
		} else {
			// otherwise use Punycode class
			$pc = new Punycode();
			$value = $pc->decode($value);
		}
		// if utf8 conversion failed, restore original value
		if($value === false || !strlen($value)) $value = $_value;
		return $value;
	}

	/**
	 * Encode a name value to PW-punycode
	 *
	 * Values already starting with "xn-", or consisting only of ASCII letters, digits,
	 * periods, hyphens and underscores, are returned unchanged.
	 *
	 * @param string $value
	 * @return string
	 *
	 */
	protected function punyEncodeName($value) {
		// exclude values that don't need to be converted
		if(strpos($value, 'xn-') === 0) return $value;
		if(ctype_alnum(str_replace(array('.', '-', '_'), '', $value))) return $value;

		$tt = $this->getTextTools();

		// collapse doubled underscores, since "__" is used below as the part separator
		while(strpos($value, '__') !== false) {
			$value = str_replace('__', '_', $value);
		}

		if(strlen($value) >= 50) {
			// values of 50+ bytes are encoded in 12-character parts joined by "__"
			$_value = $value;
			$parts = array();
			while(strlen($_value)) {
				$part = $tt->substr($_value, 0, 12);
				$_value = $tt->substr($_value, 12);
				$parts[] = $this->punyEncodeName($part);
			}
			$value = implode('__', $parts);
			return $value;
		}

		// keep the original for the fallback below
		$_value = $value;

		// in both cases the first 3 characters of the encoded result are dropped
		if(function_exists("idn_to_ascii")) {
			// use native php function if available
			$value = substr(@idn_to_ascii($value), 3);
		} else {
			// otherwise use Punycode class
			$pc = new Punycode();
			$value = substr($pc->encode($value), 3);
		}

		if(strlen($value) && $value !== '-') {
			// in PW the xn- prefix has one fewer hyphen than in native Punycode
			// for compatibility with pageName sanitization and beautification
			$value = "xn-$value";
		} else {
			// fallback to regular 'name' sanitization on failure, ensuring that
			// return value is always ascii
			$value = $this->name($_value);
		}

		return $value;
	}

	/**
	 * Format required by ProcessWire user names
	 *
	 * #pw-internal
	 *
	 * @deprecated, use pageName instead.
/**
 * Format required by ProcessWire user names
 *
 * Alias that forwards the value to `$sanitizer->pageName()` with its default arguments.
 *
 * #pw-internal
 *
 * @deprecated Use pageName() instead.
 * @param string $value
 * @return string
 *
 */
public function username($value) {
	return $this->pageName($value);
}

/**
 * Name filter for ProcessWire filenames (basenames only, not paths)
 *
 * This sanitizes a filename to be consistent with the name format in ProcessWire,
 * ASCII-alphanumeric, hyphens, underscores and periods.
 *
 * Any directory portion is removed with `basename()`. A name longer than `$maxLength` is
 * shortened before sanitization: when it has an extension the extension is kept and the
 * part before it is shortened, otherwise the name is simply cut at `$maxLength`.
 *
 * #pw-group-strings
 * #pw-group-files
 *
 * @param string $value Filename to sanitize
 * @param bool|int $beautify Should be true when creating a file's name for the first time. Default is false.
 *	You may also specify Sanitizer::translate (or number 2) for the $beautify param, which will make it translate letters
 *	based on the InputfieldPageName custom config settings.
 * @param int $maxLength Maximum number of characters allowed in the filename
 * @return string Sanitized filename, or blank string if given a non-string value
 *
 */
public function filename($value, $beautify = false, $maxLength = 128) {

	if(!is_string($value)) return '';
	$value = basename($value);

	if(strlen($value) > $maxLength) {
		$pathinfo = pathinfo($value);
		if(isset($pathinfo['extension'])) {
			// truncate, while keeping extension intact
			$extLen = strlen($pathinfo['extension']) + 1; // +1 includes period
			$basename = substr($pathinfo['filename'], 0, $maxLength - $extLen);
			$value = "$basename.$pathinfo[extension]";
		} else {
			// pathinfo() omits the 'extension' key when the name has no period:
			// nothing to preserve, and no period should be appended
			$value = substr($value, 0, $maxLength);
		}
	}

	$value = $this->name($value, $beautify, $maxLength, '_', array(
		'allowAdjacentExtras' => true, // language translation filenames require doubled "--" chars, others may too
	));

	// adjacent periods are never allowed in the result
	while(strpos($value, '..') !== false) $value = str_replace('..', '', $value);

	return $value;
}
/**
 * Hookable alias of filename method for case consistency with other name methods (preferable to use filename)
 *
 * #pw-internal
 *
 * @param string $value
 * @param bool|int $beautify Should be true when creating a file's name for the first time. Default is false.
 *	You may also specify Sanitizer::translate (or number 2) for the $beautify param, which will make it translate letters
 *	based on the InputfieldPageName custom config settings.
 * @param int $maxLength Maximum number of characters allowed in the name
 * @return string
 *
 */
public function ___fileName($value, $beautify = false, $maxLength = 128) {
	$sanitized = $this->filename($value, $beautify, $maxLength);
	return $sanitized;
}

/**
 * Validate the given path, return path if valid, or false if not valid
 *
 * The path is accepted only when it consists of the characters `[-_./a-z0-9]` (matched
 * case-insensitively), is no longer than the maximum length, and contains neither "/./"
 * nor "//" (nor ".." unless the `allowDotDot` option is enabled). On platforms where the
 * directory separator is not "/", separators are converted to "/" first, and the converted
 * path is what gets returned.
 *
 * This method validates only and does not sanitize. See `$sanitizer->pagePathName()` for a
 * similar method that does sanitization.
 *
 * #pw-group-strings
 * #pw-group-pages
 *
 * @param string $value Path to validate
 * @param int|array $options Options to modify behavior, or maxLength (int) may be specified.
 *  - `allowDotDot` (bool): Whether to allow ".." in a path (default=false)
 *  - `maxLength` (int): Maximum length of allowed path (default=1024)
 * @return bool|string Returns false if invalid, actual path (string) if valid.
 * @see Sanitizer::pagePathName()
 *
 */
public function path($value, $options = array()) {

	if(!is_string($value)) return false;

	// an integer given in place of the options array is shorthand for maxLength
	if(is_int($options)) $options = array('maxLength' => $options);

	$options = array_merge(array(
		'allowDotDot' => false,
		'maxLength' => 1024
	), $options);

	// normalize platform-specific directory separators
	if(DIRECTORY_SEPARATOR != '/') $value = str_replace(DIRECTORY_SEPARATOR, '/', $value);

	$invalid = strlen($value) > $options['maxLength']
		|| strpos($value, '/./') !== false
		|| strpos($value, '//') !== false
		|| (!$options['allowDotDot'] && strpos($value, '..') !== false)
		|| !preg_match('{^[-_./a-z0-9]+$}iD', $value);

	return $invalid ? false : $value;
}
/**
 * Sanitize a page path name
 *
 * The path is split on "/" and each non-empty segment is sanitized with `$sanitizer->pageName()`
 * (or the whole path with `$sanitizer->pagePathNameUTF8()` in the UTF-8 modes). Afterwards any
 * double-slash, double-dot or slash-dot sequences are collapsed and the path is truncated to
 * `$maxLength` at a segment boundary where one is available.
 *
 * Returned path is not guaranteed to be valid or match a page, just sanitized.
 *
 * #pw-group-strings
 * #pw-group-pages
 *
 * @param string $value Value to sanitize
 * @param bool|int $beautify Beautify the value? (default=false). May be any of the following:
 *  - `true` (bool): Beautify the individual page names in the path to remove redundant and trailing punctuation and more.
 *  - `false` (bool): Do not perform any conversion or attempt to make it more pretty, just sanitize (default).
 *  - `Sanitizer::translate` (constant): Translate UTF-8 characters to visually similar ASCII (using InputfieldPageName module settings).
 *  - `Sanitizer::toAscii` (constant): Convert UTF-8 characters to punycode ASCII.
 *  - `Sanitizer::toUTF8` (constant): Convert punycode ASCII to UTF-8.
 *  - `Sanitizer::okUTF8` (constant): Allow UTF-8 characters to appear in path (implied if $config->pageNameCharset is 'UTF8').
 *  The toAscii, toUTF8 and okUTF8 modes only apply when $config->pageNameCharset is 'UTF8';
 *  otherwise they are treated like `false`.
 * @param int $maxLength Maximum length (default=2048)
 * @return string Sanitized path name
 *
 */
public function pagePathName($value, $beautify = false, $maxLength = 2048) {

	$value = $this->string($value);
	if(!strlen($value)) return '';

	$extras = array('/', '-', '_', '.');
	$utf8 = $this->wire()->config->pageNameCharset === 'UTF8';

	if($beautify === self::toAscii && $utf8) {
		// convert UTF8 to punycode when applicable
		if(ctype_alnum(str_replace($extras, '', $value))) {
			// value needs no ascii conversion
		} else {
			// convert UTF8 to ascii value
			$parts = explode('/', $value);
			foreach($parts as $n => $part) {
				if(!strlen($part) || ctype_alnum($part)) continue;
				// only segments containing something beyond alphanumerics/extras need conversion
				$b = (ctype_alnum(str_replace($extras, '', $part)) ? false : self::toAscii);
				$parts[$n] = $this->pageName($part, $b, $maxLength);
			}
			$value = implode('/', $parts);
		}

	} else if($beautify === self::okUTF8 && $utf8) {
		// UTF8 path
		$value = $this->pagePathNameUTF8($value);

	} else if($beautify === self::toUTF8 && $utf8 && strpos($value, 'xn-') !== false) {
		// ASCII to UTF8 conversion, when requested
		$parts = explode('/', $value);
		foreach($parts as $n => $part) {
			if(!strlen($part)) continue;
			// only segments carrying the "xn-" prefix are decoded
			$b = strpos($part, 'xn-') === 0 ? self::toUTF8 : false;
			$parts[$n] = $this->pageName($part, $b, $maxLength);
		}
		$value = implode('/', $parts);
		$value = $this->pagePathNameUTF8($value);

	} else {
		// ASCII path standard: the UTF-8 related modes do not apply here
		$b = $beautify;
		if($b === self::okUTF8 || $b === self::toUTF8 || $b === self::toAscii) $b = false;
		$parts = explode('/', $value);
		foreach($parts as $n => $part) {
			if(!strlen($part)) continue;
			$parts[$n] = $this->pageName($part, $b, $maxLength);
		}
		$value = implode('/', $parts);
	}

	// no double-slash, double-dot or slash-dot
	// repeat until nothing changes: one replacement can create another pattern
	// (i.e. "a/./b" becomes "a//b"), so a single pass per pattern is not enough
	$find = array('//', '..', '/.');
	$replace = array('/', '.', '/');
	do {
		$before = $value;
		$value = str_replace($find, $replace, $value);
	} while($value !== $before);

	// truncate if needed
	if($maxLength && strlen($value) > $maxLength) {
		$slash = substr($value, -1) === '/'; // remember trailing slash so it can be restored
		$value = substr($value, 0, $maxLength);
		// cut back to the last slash so that no partial segment remains
		$pos = strrpos($value, '/');
		if($pos) $value = substr($value, 0, $pos);
		if($slash) $value = rtrim($value, '/') . '/';
	}

	return $value;
}
/**
 * Sanitize a UTF-8 page path name (does not perform ASCII/UTF8 conversions)
 *
 * - If `$config->pageNameCharset` is not `UTF8` then this does the same thing as `$sanitizer->pagePathName()`.
 * - Otherwise every "/"-separated segment goes through `$sanitizer->pageName()` in okUTF8 mode and
 *   the sequences "..", "/.", "./" and "//" are removed from the result.
 * - Returned path is not guaranteed to be valid or match a page, just sanitized.
 *
 * #pw-group-strings
 * #pw-group-pages
 *
 * @param string $value Path name to sanitize
 * @return string
 * @see Sanitizer::pagePathName()
 *
 */
public function pagePathNameUTF8($value) {

	$config = $this->wire()->config;
	if($config->pageNameCharset !== 'UTF8') {
		return $this->pagePathName($value);
	}

	$path = $this->string($value);
	if(!strlen($path)) return '';

	// sanitize each segment individually
	$segments = array();
	foreach(explode('/', $path) as $segment) {
		$segments[] = $this->pageName($segment, self::okUTF8);
	}
	$path = implode('/', $segments);

	// strip all forbidden sequences for as long as the one being checked is present
	$forbidden = array('..', '/.', './', '//');
	foreach($forbidden as $sequence) {
		while(strpos($path, $sequence) !== false) {
			$path = str_replace($forbidden, '', $path);
		}
	}

	return $path;
}

/**
 * Sanitize to ASCII alpha (a-z A-Z)
 *
 * The value is first reduced with `$sanitizer->alphanumeric()`, after which digits (and, as a
 * last resort, anything else that is not a-z or A-Z) are removed.
 *
 * #pw-group-strings
 *
 * @param string $value Value to sanitize
 * @param bool|int $beautify Whether to beautify (See Sanitizer::translate option too)
 * @param int $maxLength Maximum length of returned value (default=1024)
 * @return string
 *
 */
public function alpha($value, $beautify = false, $maxLength = 1024) {

	// request a longer intermediate value than needed; the final limit is applied at the end
	$letters = $this->alphanumeric($value, $beautify, $maxLength * 10);

	if(!ctype_alpha($letters)) {
		// first attempt: drop the known digit characters
		$digits = str_split($this->digitASCII);
		$letters = str_replace($digits, '', $letters);
		if(!ctype_alpha($letters)) {
			// still something other than letters: fall back to a regular expression
			$letters = preg_replace('/[^a-zA-Z]+/', '', $letters);
		}
	}

	return strlen($letters) > $maxLength ? substr($letters, 0, $maxLength) : $letters;
}

/**
 * Sanitize to ASCII alphanumeric (a-z A-Z 0-9)
 *
 * Uses the name filter with underscore as the only extra character and as the replacement,
 * then removes all underscores from the result.
 *
 * #pw-group-strings
 *
 * @param string $value Value to sanitize
 * @param bool|int $beautify Whether to beautify (See Sanitizer::translate option too)
 * @param int $maxLength Maximum length of returned value (default=1024)
 * @return string
 *
 */
public function alphanumeric($value, $beautify = false, $maxLength = 1024) {

	// request a longer intermediate value than needed; the final limit is applied below
	$filtered = $this->nameFilter($value, array('_'), '_', $beautify, $maxLength * 10);
	$filtered = str_replace('_', '', $filtered);

	if(strlen($filtered) <= $maxLength) return $filtered;

	return substr($filtered, 0, $maxLength);
}
/**
 * Sanitize string to contain only ASCII digits (0-9)
 *
 * The value is first passed through the name filter (underscore as the only extra character
 * and as the replacement), after which underscores and letters (and, as a last resort,
 * anything else that is not a digit) are removed.
 *
 * #pw-group-strings
 * #pw-group-numbers
 *
 * @param string $value Value to sanitize
 * @param int $maxLength Maximum length of returned value (default=1024)
 * @return string
 *
 */
public function digits($value, $maxLength = 1024) {

	// request a longer intermediate value than needed; the final limit is applied at the end
	$numbers = $this->nameFilter($value, array('_'), '_', false, $maxLength * 10);

	if(!ctype_digit($numbers)) {
		// first attempt: drop underscores and the known alpha characters
		$nonDigits = str_split('_' . $this->alphaASCII);
		$numbers = str_replace($nonDigits, '', $numbers);
		if(!ctype_digit($numbers)) {
			// still something other than digits: fall back to a regular expression
			$numbers = preg_replace('/[^\d]+/', '', $numbers);
		}
	}

	return strlen($numbers) > $maxLength ? substr($numbers, 0, $maxLength) : $numbers;
}
/**
 * Sanitize and validate an email address
 *
 * Returns valid email address, or blank string if it isn't valid.
 *
 * A value in `Name <address>` form is reduced to the address part. Spaces are removed
 * and leading/trailing periods and at-signs are trimmed before validation.
 *
 * #pw-group-strings
 * #pw-group-validate
 *
 * @param string $value Email address to sanitize and validate.
 * @param array $options All options require 3.0.208+
 *  - `allowIDN` (bool|int): Allow internationalized domain names? (default=false)
 *     Specify int 2 to also allow UTF-8 in local-part of email [SMTPUTF8] (i.e. `bøb`).
 *  - `getASCII` (bool): Returns ASCII encoded version of email when host is IDN (default=false)
 *     Does not require the allowIDN option since returned email host will be only ASCII.
 *     Not meant to be combined with allowIDN=2 option since local-part of email does not ASCII encode.
 *  - `getUTF8` (bool): Converts ASCII-encoded IDNs to UTF-8, when present (default=false)
 *  - `checkDNS` (bool): Check that host part of email has a valid DNS record? (default=false)
 *     Warning: this slows things down a lot and should not be used in time sensitive cases.
 *  - `throw` (bool): Throw WireException on fail with details on why it failed (default=false)
 * @return string Sanitized, valid email address, or blank string on failure.
 * @throws WireException If the `throw` option is enabled and the email is not valid
 *
 */
public function email($value, array $options = array()) {

	if(empty($value)) return '';

	$defaults = array(
		'allowIDN' => false,
		'getASCII' => false,
		'getUTF8' => false,
		'checkDNS' => false,
		'throw' => false,
		'_debug' => false, // internal: return the failure reason rather than blank string
	);

	$options = array_merge($defaults, $options);
	$debug = $options['_debug'];

	if($options['throw']) {
		// re-run in debug mode so that a failure returns its reason, which has no "@" in it
		unset($options['throw']);
		$value = $this->email($value, array_merge($options, array('_debug' => true)));
		if(!strpos($value, '@')) throw new WireException($value);
		return $value;
	}

	if($options['checkDNS']) {
		unset($options['checkDNS']);
		$valueASCII = $this->email($value, array_merge($options, array('getASCII' => true)));
		if(strpos($valueASCII, '@') === false) return $valueASCII; // fail
		// look up the host of the sanitized ASCII address: the given value may still
		// contain a UTF-8 host, spaces or a "Name <address>" wrapper
		list(,$host) = explode('@', $valueASCII, 2);
		$dns = dns_get_record($host, DNS_MX | DNS_A | DNS_CNAME | DNS_AAAA);
		if(empty($dns)) return ($debug ? 'Failed DNS check' : '');
		if($options['getASCII']) return $valueASCII;
		return $this->email($value, $options);
	}

	$value = trim(trim((string) $value), '.@');
	if(!strlen($value)) return ($debug ? 'Trimmed value is empty' : '');
	if(!strpos($value, '@')) return ($debug ? 'Missing at symbol' : '');
	if(strpos($value, ' ')) $value = str_replace(' ', '', $value);

	if($options['getUTF8'] && strpos($value, 'xn-') !== false && function_exists('\idn_to_utf8')) {
		// convert ASCII-encoded IDN host to UTF-8, leaving value as-is if conversion fails
		list($addr, $host) = explode('@', $value, 2);
		if(strpos($host, 'xn-') !== false) {
			$host = idn_to_utf8($host);
			if($host !== false) $value = "$addr@$host";
		}
	}

	if(filter_var($value, FILTER_VALIDATE_EMAIL)) return $value; // valid

	$pos = strpos($value, '<');
	if($pos !== false && strpos($value, '>') > $pos+3) {
		// John Smith <jsmith@domain.com> => jsmith@domain.com
		list(,$value) = explode('<', $value, 2);
		list($value,) = explode('>', $value, 2);
		return $this->email($value, $options);
	}

	// all following code for processing IDN emails
	if(!$options['allowIDN'] && !$options['getASCII']) return ($debug ? 'Invalid+allowIDN/getASCII=0' : '');
	if(preg_match('/^[-@_.a-z0-9]+$/i', $value)) return ($debug ? 'Invalid and not IDN' : '');

	$parts = explode('@', $value);
	if(count($parts) !== 2) return ($debug ? 'More than one at symbol' : '');

	$tt = $this->getTextTools();
	list($addr, $host) = $parts;
	if($tt->strlen($addr) > 64) return ($debug ? 'Local part exceeds 64 max length' : '');
	if($tt->strlen($host) > 255) return ($debug ? 'Host part exceeds 255 max length' : '');

	if(function_exists('\idn_to_ascii')) {
		// if email doesn't survive IDN conversions then not valid
		$email = $value;
		$hostASCII = idn_to_ascii($host);
		if($hostASCII === false) return ($debug ? 'Fail UTF8-to-ASCII' : '');
		// with allowIDN=2 the local-part may be UTF-8, so a placeholder stands in for it here
		$test = ($options['allowIDN'] === 2 ? 'bob' : $addr) . "@$hostASCII";
		if(!filter_var($test, FILTER_VALIDATE_EMAIL)) return ($debug ? 'Fail validate post IDN-to-ASCII' : '');
		$hostUTF8 = idn_to_utf8($hostASCII);
		if($hostUTF8 === false) return ($debug ? 'Fail IDN-to-UTF8 conversion' : '');
		$value = "$addr@$hostUTF8";
		if($email !== $value) return ($debug ? 'Modified by IDN conversion' : '');
		if($options['getASCII']) return "$addr@$hostASCII";
	} else if($options['getASCII']) {
		return ($debug ? 'getASCII requested and idn_to_ascii not available' : '');
	}

	// LOCAL and ALPHA below are placeholders that get substituted with character classes
	$regex = // regex adapted from Validators::isEmail() in https://github.com/nette/utils/
		'@^' .
		'("([ !#-[\]-~]*|\\\[ -~])+"|LOCAL+(\.LOCAL+)*)\@' . // local-part
		'([\dALPHA]([-\dALPHA]{0,61}[\dALPHA])?\.)+' . // domain
		'[ALPHA]([-\dALPHA]{0,17}[ALPHA])?' . // TLD
		'$@Di';

	$local = "-a-z\d!#$%&'*+/=?^_`{|}~" . ($options['allowIDN'] === 2 ? "\x80-\xFF" : '');
	$regex = str_replace('LOCAL', "[$local]", $regex); // RFC5322 unquoted characters
	$regex = str_replace('ALPHA', "a-z\x80-\xFF", $regex); // superset of IDN

	if(!preg_match($regex, $value)) return ($debug ? 'Fail IDN regex' : '');

	return $value;
}
/**
 * Returns a value that may be used in an email header
 *
 * This method is designed to prevent one email header from injecting into another.
 * Newline characters, and several textual spellings of them, are replaced with a space.
 * When sanitizing a header name, anything other than ASCII letters, digits, hyphens and
 * underscores is additionally converted to a hyphen.
 *
 * #pw-group-strings
 *
 * @param string $value
 * @param bool $headerName Sanitize a header name rather than header value? (default=false) Since 3.0.132
 * @return string Sanitized value, or blank string if given a non-string value
 *
 */
public function emailHeader($value, $headerName = false) {
	if(!is_string($value)) return '';
	// newlines: actual CR/LF characters, plus literal "<CR>"/"<LF>" markers and the
	// hex and URL-encoded spellings (all matched case-insensitively)
	$a = array("\n", "\r", "<CR>", "<LF>", "0x0A", "0x0D", "%0A", "%0D");
	$value = trim(str_ireplace($a, ' ', stripslashes($value)));
	// header names: drop surrounding colons, then hyphenate anything outside [-_a-zA-Z0-9]
	if($headerName) $value = trim(preg_replace('/[^-_a-zA-Z0-9]/', '-', trim($value, ':')), '-');
	return $value;
}
/**
 * Return first word in given string
 *
 * The words are extracted by `$sanitizer->wordsArray()`, which receives the given options.
 * When a `separator` option is given, multiple words are merged into one string instead.
 *
 * #pw-group-strings
 *
 * @param string $value String containing one or more words
 * @param array $options Options to adjust behavior:
 *  - `keepNumbers` (bool): Allow numbers as return value? (default=true)
 *  - `keepNumberFormat` (bool): Keep minus/comma/period in numbers rather than splitting into words? Also requires keepNumbers==true. (default=false)
 *  - `keepUnderscore` (bool): Keep underscores as part of words? (default=false)
 *  - `keepHyphen` (bool): Keep hyphenated words? (default=false)
 *  - `keepChars` (array): Specify any of these to also keep as part of words ['.', ',', ';', '/', '*', ':', '+', '<', '>', '_', '-' ] (default=[])
 *  - `minWordLength` (int): Minimum word length (default=1)
 *  - `maxWordLength` (int): Maximum word length (default=80)
 *  - `maxWords` (int): Maximum words (default=1 or 99 if a separator option is specified)
 *  - `maxLength` (int): Maximum returned string length (default=1024)
 *  - `stripTags` (bool): Strip markup tags so they don't contribute to returned word? (default=true)
 *  - `separator` (string): Merge multiple words into one word split by this character? (default='', disabled) 3.0.195+
 *  - `ascii` (bool): Allow only ASCII word characters? (default=false)
 *  - `beautify` (bool): Make ugly strings more pretty? This collapses and trims redundant kept characters (default=false)
 * @return string
 * @see Sanitizer::wordsArray()
 * @since 3.0.162
 *
 */
public function word($value, array $options = array()) {

	if(!is_string($value)) $value = $this->string($value);

	// options that are used directly by this method
	$separator = isset($options['separator']) ? $options['separator'] : null;
	$hasSeparator = $separator !== null;
	$keepChars = isset($options['keepChars']) ? $options['keepChars'] : array();
	$maxLength = isset($options['maxLength']) ? (int) $options['maxLength'] : 1024;
	$minWordLength = isset($options['minWordLength']) ? $options['minWordLength'] : 1;

	if(empty($options['maxWords'])) {
		$options['maxWords'] = $hasSeparator ? 99 : 1;
	}

	// keepHyphen and keepUnderscore are shorthand for entries in keepChars
	$shorthand = array('keepHyphen' => '-', 'keepUnderscore' => '_');
	foreach($shorthand as $option => $char) {
		if(empty($options[$option])) continue;
		if(!in_array($char, $keepChars)) $keepChars[] = $char;
	}
	$options['keepChars'] = $keepChars;

	$words = $this->wordsArray($value, $options);
	$numWords = count($words);
	if($numWords === 0) return '';

	// merge the words when a separator was requested, otherwise use the first one
	$word = ($hasSeparator && $numWords > 1) ? implode($separator, $words) : reset($words);

	if(!empty($options['ascii'])) {
		// the name filter also takes care of the maximum length
		$word = $this->nameFilter($word, $keepChars, ($hasSeparator ? $separator : ''), Sanitizer::translate, $maxLength);
	} else if($maxLength) {
		if($this->multibyteSupport) {
			if(mb_strlen($word) > $maxLength) $word = mb_substr($word, 0, $maxLength);
		} else {
			if(strlen($word) > $maxLength) $word = substr($word, 0, $maxLength);
		}
	}

	if(!empty($options['beautify'])) {
		// collapse doubled kept characters and trim them from both ends
		foreach($keepChars as $char) {
			$doubled = "$char$char";
			while(strpos($word, $doubled) !== false) {
				$word = str_replace($doubled, $char, $word);
			}
		}
		$word = trim($word, implode('', $keepChars));
	}

	if($minWordLength > 1 && strlen($word) < $minWordLength) return '';

	return $word;
}
/**
 * Given string return a new string containing only words
 *
 * This calls `$sanitizer->word()` with the defaults listed below, which differ from the
 * defaults of that method. Any option accepted by `$sanitizer->word()` may be specified.
 *
 * #pw-group-strings
 *
 * @param string $value
 * @param array $options
 *  - `separator` (string): String to use to separate words (default=' ')
 *  - `ascii` (bool): Only allow ASCII characters in words? (default=false)
 *  - `keepUnderscore` (bool): Keep underscores as part of words? (default=true)
 *  - `keepHyphen` (bool): Keep hyphenated words? (default=true)
 *  - `keepChars` (array): Additional non word characters to keep (default=[])
 *  - `maxWordLength` (int): Maximum word length (default=255)
 *  - `minWordLength` (int): Minimum word length (default=1)
 *  - `maxLength` (int): Maximum return value length (default=1024)
 *  - `beautify` (bool): Make ugly strings more pretty? This collapses and trims redundant kept characters (default=true)
 * @since 3.0.195
 * @return string
 *
 */
public function words($value, array $options = array()) {

	// caller-specified options take precedence over these defaults
	$settings = array_merge(array(
		'ascii' => false,
		'separator' => ' ',
		'keepHyphen' => true,
		'keepUnderscore' => true,
		'keepChars' => array(),
		'maxWordLength' => 255,
		'maxLength' => 1024,
		'beautify' => true,
	), $options);

	return $this->word($value, $settings);
}
/**
 * Sanitize short string of text to single line without HTML
 *
 * - This sanitizer is useful for short strings of input text like first and last names, street names, search queries, etc.
 *
 * - Please note the default 255 character max length setting.
 *
 * - If using returned value for front-end output, be sure to run it through `$sanitizer->entities()` first.
 *
 * ~~~~~
 * $str = "
 *   Hello World
 *   How are you doing today?
 * ";
 *
 * echo $sanitizer->text($str);
 * // outputs: Hello World How are you doing today?
 * ~~~~~
 *
 * #pw-group-strings
 *
 * @param string $value String value to sanitize
 * @param array $options Options to modify default behavior:
 *  - `maxLength` (int): maximum characters allowed, or 0=no max (default=255).
 *  - `maxBytes` (int): maximum bytes allowed (default=0, which implies maxLength*4).
 *  - `stripTags` (bool): strip markup tags? (default=true).
 *  - `stripMB4` (bool): strip emoji and other 4-byte UTF-8? (default=false).
 *  - `stripQuotes` (bool): strip out any "quote" or 'quote' characters? Specify true, or character to replace with. (default=false)
 *  - `stripSpace` (bool|string): strip whitespace? Specify true or character to replace whitespace with (default=false). Since 3.0.105
 *  - `reduceSpace` (bool|string): reduce consecutive whitespace to single? Specify true or character to reduce to (default=false).
 *     Note that the reduceSpace option is an alternative to the stripSpace option, they should not be used together. Since 3.0.105
 *  - `allowableTags` (string): markup tags that are allowed, if stripTags is true (use same format as for PHP's `strip_tags()` function).
 *  - `multiLine` (bool): allow multiple lines? if false, then $newlineReplacement below is applicable (default=false).
 *  - `convertEntities` (bool): convert HTML entities to equivalent character(s)? (default=false). Since 3.0.105
 *  - `newlineReplacement` (string): character to replace newlines with, OR specify boolean true to remove extra lines (default=" ").
 *  - `truncateTail` (bool): if truncate necessary for maxLength, truncate from end/tail? Use false to truncate head (default=true). Since 3.0.105
 *  - `trim` (bool|string): trim whitespace from beginning/end? Specify true, or character(s) to trim, or false to disable (default=true).
 *  - `inCharset` (string): input character set (default="UTF-8").
 *  - `outCharset` (string): output character set (default="UTF-8").
 * @return string
 * @see Sanitizer::textarea(), Sanitizer::line()
 *
 */
public function text($value, $options = array()) {

	$defaultOptions = array(
		'maxLength' => 255, // maximum characters allowed, or 0=no max
		'maxBytes' => 0,  // maximum bytes allowed (0 = default, which is maxLength*4)
		'stripTags' => true, // strip markup tags
		'stripMB4' => false, // strip Emoji and 4-byte characters?
		'stripQuotes' => false, // strip quote characters? Specify true, or character to replace them with
		'stripSpace' => false, // remove/replace whitespace? If yes, specify character to replace with, or true for blank
		'reduceSpace' => false, // reduce whitespace to single? If yes, specify character to replace with or true for ' '.
		'allowableTags' => '', // tags that are allowed, if stripTags is true (use same format as for PHP's strip_tags function)
		'multiLine' => false, // allow multiple lines? if false, then $newlineReplacement below is applicable
		'convertEntities' => false, // convert HTML entities to equivalent characters?
		'newlineReplacement' => ' ', // character to replace newlines with, OR specify boolean TRUE to remove extra lines
		'inCharset' => 'UTF-8', // input charset
		'outCharset' => 'UTF-8',  // output charset
		'truncateTail' => true, // if truncate necessary for maxLength, remove chars from tail? False to truncate from head.
		'trim' => true, // trim whitespace from beginning/end, or specify character(s) to trim, or false to disable
	);

	static $alwaysReplace = null;
	$truncated = false;
	$options = array_merge($defaultOptions, $options);
	if(isset($options['multiline'])) $options['multiLine'] = $options['multiline']; // common case error
	if(isset($options['maxlength'])) $options['maxLength'] = $options['maxlength']; // common case error
	if($options['maxLength'] < 0) $options['maxLength'] = 0;
	if($options['maxBytes'] < 0) $options['maxBytes'] = 0;

	if($alwaysReplace === null) {
		// built once per request; written as an entity so the invisible character is explicit
		$alwaysReplace = array(
			html_entity_decode('&#8232;', ENT_QUOTES, 'UTF-8') => '', // line-separator that is sometimes copy/pasted
		);
	}

	if($options['reduceSpace'] !== false && $options['stripSpace'] === false) {
		// if reduceSpace option is used then provide necessary value for stripSpace option
		$options['stripSpace'] = is_string($options['reduceSpace']) ? $options['reduceSpace'] : ' ';
	}

	if(!is_string($value)) $value = $this->string($value);

	if(!$options['multiLine']) {
		if(strpos($value, "\r") !== false) {
			$value = str_replace("\r", "\n", $value); // normalize to LF
		}
		$pos = strpos($value, "\n");
		if($pos !== false) {
			if($options['newlineReplacement'] === true) {
				// remove extra lines
				$value = rtrim(substr($value, 0, $pos));
			} else {
				// remove linefeeds
				$value = str_replace(array("\n\n", "\n"), $options['newlineReplacement'], $value);
			}
		}
	}

	if($options['stripTags']) {
		$value = strip_tags($value, $options['allowableTags']);
	}

	if($options['inCharset'] != $options['outCharset']) {
		$value = iconv($options['inCharset'], $options['outCharset'], $value);
	}

	if($options['convertEntities']) {
		$value = $this->unentities($value, true, $options['outCharset']);
	}

	foreach($alwaysReplace as $find => $replace) {
		if(strpos($value, $find) === false) continue;
		$value = str_replace($find, $replace, $value);
	}

	if($options['stripSpace'] !== false) {
		$c = is_string($options['stripSpace']) ? $options['stripSpace'] : '';
		$allow = $options['multiLine'] ? array("\n") : array();
		$value = $this->removeWhitespace($value, array('replace' => $c, 'allow' => $allow));
	}

	if($options['stripMB4']) {
		$value = $this->removeMB4($value);
	}

	if($options['stripQuotes']) {
		$value = str_replace(array('"', "'"), (is_string($options['stripQuotes']) ? $options['stripQuotes'] : ''), $value);
	}

	if($options['trim']) {
		$value = is_string($options['trim']) ? trim($value, $options['trim']) : trim($value);
	}

	if($options['maxLength']) {
		if(empty($options['maxBytes'])) $options['maxBytes'] = $options['maxLength'] * 4;
		if($this->multibyteSupport) {
			if(mb_strlen($value, $options['outCharset']) > $options['maxLength']) {
				$truncated = true;
				if($options['truncateTail']) {
					$value = mb_substr($value, 0, $options['maxLength'], $options['outCharset']);
				} else {
					$value = mb_substr($value, -1 * $options['maxLength'], null, $options['outCharset']);
				}
			}
		} else {
			if(strlen($value) > $options['maxLength']) {
				$truncated = true;
				if($options['truncateTail']) {
					$value = substr($value, 0, $options['maxLength']);
				} else {
					$value = substr($value, -1 * $options['maxLength']);
				}
			}
		}
	}

	if($options['maxBytes']) {
		// shrink one character at a time until the value fits within the byte limit
		$n = $options['maxBytes'];
		while(strlen($value) > $options['maxBytes']) {
			$truncated = true;
			$n--;
			if($n < 1) {
				// not even a single character fits within the byte limit
				$value = '';
				break;
			}
			if($this->multibyteSupport) {
				if($options['truncateTail']) {
					$value = mb_substr($value, 0, $n, $options['outCharset']);
				} else {
					// negative start keeps the LAST $n characters (head is what gets removed)
					$value = mb_substr($value, -1 * $n, null, $options['outCharset']);
				}
			} else {
				if($options['truncateTail']) {
					$value = substr($value, 0, $n);
				} else {
					// negative start keeps the LAST $n bytes (head is what gets removed)
					$value = substr($value, -1 * $n);
				}
			}
		}
	}

	if($truncated && $options['trim']) {
		// secondary trim after truncation
		$value = is_string($options['trim']) ? trim($value, $options['trim']) : trim($value);
	}

	return $value;
}