artabro/wire/core/DatabaseQuerySelectFulltext.php

1432 lines
42 KiB
PHP
Raw Normal View History

2024-08-27 11:35:37 +02:00
<?php namespace ProcessWire;
/**
* ProcessWire DatabaseQuerySelectFulltext
*
* A wrapper for SELECT SQL queries using FULLTEXT indexes
*
* Decorates a DatabaseQuerySelect object by providing the WHERE and
* ORDER parts for a fulltext query based on the table, field, operator
* and value you are searching.
*
* Assumes that you are providing at least the SELECT and FROM portions
* of the query.
*
* The intention behind these classes is to have a query that can safely
* be passed between methods and objects that add to it without knowledge
* of what other methods/objects have done to it. It also means being able
* to build a complex query without worrying about correct syntax placement.
*
* This file is licensed under the MIT license
* https://processwire.com/about/license/mit/
*
* ProcessWire 3.x, Copyright 2022 by Ryan Cramer
* https://processwire.com
*
* @property-read string $tableField The "tableName.fieldName" string for the current table and field
*
*
*
*/
class DatabaseQuerySelectFulltext extends Wire {
/**
* Max length that we allow for a query
*
*/
const maxQueryValueLength = 500;
/**
* @var DatabaseQuerySelect|PageFinderDatabaseQuerySelect
*
*/
protected $query;
/**
* @var string
*
*/
protected $tableName = '';
/**
* Current field/column name
*
* @var string
*
*/
protected $fieldName = '';
/**
* All field/column names (if more than one)
*
* @var array
*
*/
protected $fieldNames = array();
/**
* @var string
*
*/
protected $operator = '';
/**
* @var string
*
*/
protected $method = '';
/**
* Is it a NOT operator?
*
* This is not used by PageFinder originating queries, which handles NOT internally.
*
* @var bool
*
*/
protected $not = false;
/**
* Cached minimum word length
*
* @var int|null
*
*/
protected $minWordLength = null;
/**
* Allow adding 'ORDER BY' to query?
*
* @var bool|null
*
*/
protected $allowOrder = null;
/**
* Allow fulltext searches to fallback to LIKE searches to match stopwords?
*
* @var bool
*
*/
protected $allowStopwords = true;
/**
* @var array
*
*/
static protected $scoreCnts = array();
/**
* Method names to operators they handle
*
* @var array
*
*/
protected $methodOperators = array(
'matchEquals' => array('=', '!=', '>', '<', '>=', '<='),
'matchPhrase' => array('*='),
'matchPhraseExpand' => array('*+='),
'matchRegular' => array('**=', '**+='),
'matchStartEnd' => array('^=', '$='),
'matchWords' => array('~=', '~+=', '~*=', '~~=', '~|=', '~|*=', '~|+='),
'matchLikeWords' => array('~%=', '~|%='),
'matchLikePhrase' => array('%='),
'matchLikeStartEnd' => array('%^=', '%$='),
'matchCommands' => array('#='),
);
/**
* Alternate operators to substitute when LIKE match is forced due to no FULLTEXT index
*
* @var array of operator to replacement operator
*
*/
protected $likeAlternateOperators = array(
'*=' => '%=',
'^=' => '%^=',
'$=' => '%$=',
'~=' => '~%=',
'~|=' => '~|%=',
);
/**
* Force use of LIKE?
*
* @var bool
*
*/
protected $forceLike = false;
/**
 * Construct
 *
 * Keeps a reference to the query being decorated and passes this object
 * to that query's wire() method.
 *
 * @param DatabaseQuerySelect|PageFinderDatabaseQuerySelect $query Query that match() will add to
 *
 */
public function __construct(DatabaseQuerySelect $query) {
	parent::__construct();
	$this->query = $query;
	$this->query->wire($this);
}
/**
 * Magic property getter
 *
 * Provides the virtual `tableField` property and defers every other name to the parent class.
 *
 * @param string $name Property name
 * @return mixed|string
 *
 */
public function __get($name) {
	return $name === 'tableField' ? $this->tableField() : parent::__get($name);
}
/**
 * Return the query object this instance is currently decorating
 *
 * This is the query given to the constructor.
 *
 * @return DatabaseQuerySelect
 *
 */
public function getQuery() {
	return $this->query;
}
/**
 * Build the 'tableName.fieldName' string for the current table and column
 *
 * Falls back to the column name 'data' when no field name is set.
 *
 * @return string
 *
 */
protected function tableField() {
	$column = $this->fieldName ? $this->fieldName : 'data';
	return $this->tableName . '.' . $column;
}
/**
 * Get or set whether 'ORDER BY' statements may be added to the query
 *
 * @param null|bool $allow Specify a (bool-like) value to set, or omit to get
 * @return bool|null Current setting: bool when known, or null when not yet determined
 * @since 3.0.162
 *
 */
public function allowOrder($allow = null) {
	if($allow === null) return $this->allowOrder;
	$this->allowOrder = (bool) $allow;
	return $this->allowOrder;
}
/**
 * Get or set whether fulltext searches may fall back to LIKE searches to match stopwords
 *
 * @param null|bool $allow Specify a (bool-like) value to set, or omit to get
 * @return bool Current setting
 * @since 3.0.162
 *
 */
public function allowStopwords($allow = null) {
	if($allow === null) return $this->allowStopwords;
	$this->allowStopwords = (bool) $allow;
	return $this->allowStopwords;
}
/**
 * Get the MATCH keyword for the current query, preceded by a newline and a space
 *
 * @return string 'NOT MATCH' when this is a NOT query, otherwise 'MATCH' (each with the leading "\n ")
 *
 */
protected function matchType() {
	$keyword = 'MATCH';
	if($this->not) $keyword = 'NOT MATCH';
	return "\n " . $keyword;
}
/**
 * Escape the LIKE wildcard characters "%" and "_" in the given string
 *
 * Each wildcard is prefixed with a backslash so that it is matched literally.
 * When applicable, $database->escapeStr() should be applied before this.
 *
 * @param string $str
 * @return string
 *
 */
protected function escapeLike($str) {
	$wildcards = array(
		'%' => '\\%',
		'_' => '\\_',
	);
	return strtr($str, $wildcards);
}
/**
 * Additional escape for use in a MySQL AGAINST
 *
 * Replaces characters that act as operators in a fulltext AGAINST expression with
 * spaces, then collapses any resulting runs of spaces down to a single space.
 *
 * When applicable, $database->escapeStr() must also be applied (before or after).
 *
 * @param string $str
 * @return string
 *
 */
protected function escapeAgainst($str) {
	$operators = array('@', '+', '-', '*', '~', '<', '>', '(', ')', ':', '"', '&', '|', '=', '.');
	$str = str_replace($operators, ' ', $str);
	// Collapse runs of spaces. The search string must be TWO spaces (a single-space search
	// would never terminate), and the strpos() result must be compared against false so
	// that a run starting at offset 0 is collapsed as well.
	while(strpos($str, '  ') !== false) $str = str_replace('  ', ' ', $str);
	return $str;
}
/**
 * Prepare a value for use in a query
 *
 * The value is trimmed. If it is shorter than maxQueryValueLength and contains no
 * CR/LF characters it is returned as-is, otherwise it is passed through
 * $sanitizer->trunc() with maxQueryValueLength as the maximum length.
 *
 * @param string $value
 * @return string
 *
 */
protected function value($value) {
	$value = trim($value);
	$hasLineBreak = strpos($value, "\n") !== false || strpos($value, "\r") !== false;
	if(!$hasLineBreak && strlen($value) < self::maxQueryValueLength) return $value;
	return $this->sanitizer->trunc($value, self::maxQueryValueLength);
}
/**
 * Update the query (provided to the constructor) to match the given arguments
 *
 * An operator with a leading "!" (other than "!=") is treated as a NOT version of
 * the operator that follows the "!". When LIKE is being forced, operators having a
 * LIKE alternate are swapped for that alternate before the handler is chosen.
 *
 * @param string $tableName
 * @param string|array $fieldName Column name, or array of column names to OR together
 * @param string $operator
 * @param string|int|array $value Value to match. Array value support added 3.0.141 (not used by PageFinder)
 * @return $this
 * @throws WireException If given $operator argument is not implemented here
 *
 */
public function match($tableName, $fieldName, $operator, $value) {

	$this->tableName = $this->database->escapeTable($tableName);
	$allowOrder = true;

	if(strpos($operator, '!') === 0 && $operator !== '!=') {
		$this->not = true;
		$operator = ltrim($operator, '!');
	} else {
		// disable orderby statements when calling object will be negating whatever we do
		$selector = $this->query->selector;
		if($selector instanceof Selector && $selector->not) $allowOrder = false;
	}

	// if allowOrder has not been specifically set, then set value now
	if($this->allowOrder === null) $this->allowOrder = $allowOrder;

	if($this->forceLike && isset($this->likeAlternateOperators[$operator])) {
		$operator = $this->likeAlternateOperators[$operator];
	}

	$this->operator = $operator;

	// Always start the lookup from a clean slate: without this reset, a method left over
	// from an earlier match() call on this instance would be reused for the new operator.
	$this->method = '';

	foreach($this->methodOperators as $name => $operators) {
		if(!in_array($operator, $operators, true)) continue;
		$this->method = $name;
		break;
	}

	if(!$this->method) {
		throw new WireException("Unimplemented operator in $this::match()");
	}

	// an array with fewer than 2 field names is handled like a single field name
	if(is_array($fieldName) && count($fieldName) < 2) {
		$fieldName = reset($fieldName);
	}

	if(is_array($fieldName)) {
		$this->matchArrayFieldName($fieldName, $value);
	} else {
		$this->matchFieldName($fieldName, $value);
	}

	return $this;
}
/**
 * Match a single field name against the given value
 *
 * Array values are delegated to matchArrayValue(). A non-empty scalar value is passed
 * to the method selected for the current operator. An empty value becomes an
 * "is empty" condition, or "is not empty" when the query is a NOT query or the
 * operator is "!=".
 *
 * @param string $fieldName
 * @param string|int|array $value
 *
 */
protected function matchFieldName($fieldName, $value) {

	$this->fieldName = $this->database->escapeCol($fieldName);

	if(is_array($value)) {
		$this->matchArrayValue($value);
		return;
	}

	$value = $this->value($value);

	if(strlen($value)) {
		$handler = $this->method;
		$this->$handler($value);
		return;
	}

	// empty value
	$negated = $this->not || $this->operator === '!=';
	if($negated) {
		$this->matchIsNotEmpty();
	} else {
		$this->matchIsEmpty();
	}
}
/**
 * Match when given $fieldName is an array
 *
 * Each field name is matched against a temporary substitute query. The WHERE
 * conditions collected there are OR'ed together into a single condition on the real
 * query, and the bind values are copied over to it.
 *
 * @param array $fieldNames
 * @param mixed $value
 * @since 3.0.169
 *
 */
protected function matchArrayFieldName(array $fieldNames, $value) {

	$query = $this->query;
	$query->bindOption('global', true);

	// temporarily swap in a substitute query to collect the per-field conditions
	$this->query = new DatabaseQuerySelect();
	$this->wire($this->query);
	$this->query->bindOption(true, $query->bindOption(true));

	foreach($fieldNames as $fieldName) {
		$this->matchFieldName($fieldName, $value);
	}

	// only add the OR group when something was collected, since an empty
	// group would produce the invalid SQL "(())"
	$wheres = $this->query->where;
	if(!empty($wheres)) {
		$query->where('((' . implode(') OR (', $wheres) . '))');
	}

	$this->query->copyBindValuesTo($query);
	$this->query = $query;
}
/**
 * Match when given $value is an array
 *
 * Each non-empty value is matched against a temporary substitute query using the
 * method selected for the current operator. The WHERE conditions collected there are
 * OR'ed together into a single condition on the real query, and the bind values are
 * copied over to it. Empty values in the array are skipped.
 *
 * Note: PageFinder uses its own array-to-value conversion, so this case applies only to other usages outside PageFinder,
 * such as FieldtypeMulti::getLoadQueryWhere()
 *
 * @param array $value
 * @since 3.0.141
 * @throws WireException
 *
 */
protected function matchArrayValue(array $value) {

	$query = $this->query;
	$query->bindOption('global', true);

	// temporarily swap in a substitute query to collect the per-value conditions
	$this->query = new DatabaseQuerySelect();
	$this->wire($this->query);
	$this->query->bindOption(true, $query->bindOption(true));

	$method = $this->method;

	foreach($value as $v) {
		$v = $this->value("$v");
		if(strlen($v)) $this->$method($v);
	}

	// @todo need to get anything else from substitute query?

	// only add the OR group when something was collected (i.e. not for an empty array or
	// an array of empty values), since an empty group would produce the invalid SQL "(())"
	$wheres = $this->query->where;
	if(!empty($wheres)) {
		$query->where('((' . implode(') OR (', $wheres) . '))');
	}

	$this->query->copyBindValuesTo($query);
	$this->query = $query;
}
/**
 * Match using a comparison operator: equals, not equals, less, greater, etc.
 *
 * The operator is passed through $database->escapeOperator() and the value is bound
 * to the query rather than placed in the SQL.
 *
 * @param string $value
 *
 */
protected function matchEquals($value) {
	$database = $this->wire()->database;
	$op = $database->escapeOperator($this->operator, WireDatabasePDO::operatorTypeComparison);
	$this->query->where($this->tableField() . $op . '?', $value);
}
/**
 * Match when the column is an empty string or NULL
 *
 */
protected function matchIsEmpty() {
	$col = $this->tableField();
	$this->query->where("($col='' OR $col IS NULL)");
}
/**
 * Match when the column is not NULL and not an empty string
 *
 */
protected function matchIsNotEmpty() {
	$col = $this->tableField();
	$this->query->where("($col IS NOT NULL AND $col!='')");
}
/**
 * Match a phrase anywhere in the column using LIKE (or NOT LIKE for a NOT query)
 *
 * LIKE wildcards in the value are escaped, and the value is bound with a
 * leading and trailing "%".
 *
 * @param string $value
 *
 */
protected function matchLikePhrase($value) {
	$pattern = '%' . $this->escapeLike($value) . '%';
	$like = $this->not ? 'NOT LIKE' : 'LIKE';
	$this->query->where($this->tableField() . " $like ?", $pattern);
}
/**
 * Match starts-with or ends-with using only LIKE (no match/against index)
 *
 * Does not ignore whitespace, closing tags or punctuation at start/end the way that the
 * matchStartEnd() method does, so this can be used to perform more literal start/end matches.
 * An operator containing "^" is a starts-with match, anything else is an ends-with match.
 *
 * @param string $value
 *
 */
protected function matchLikeStartEnd($value) {
	$escaped = $this->escapeLike($value);
	$startsWith = strpos($this->operator, '^') !== false;
	$pattern = $startsWith ? $escaped . '%' : '%' . $escaped;
	$like = $this->not ? 'NOT LIKE' : 'LIKE';
	$this->query->where($this->tableField() . " $like ?", $pattern);
}
/**
 * Match words (plural) LIKE, given words can appear in full or in any part of a word
 *
 * - `~%=` Match all words LIKE: each word is added as its own WHERE condition.
 * - `~|%=` Match any words LIKE: the words are combined into one OR'ed WHERE condition.
 *
 * @param string $value
 * @since 3.0.160
 *
 */
protected function matchLikeWords($value) {

	$likeType = $this->not ? 'NOT LIKE' : 'LIKE';
	$any = strpos($this->operator, '|') !== false;
	$binds = array(); // populated only in $any mode
	$wheres = array(); // populated only in $any mode

	foreach($this->words($value) as $word) {
		$word = $this->escapeLike($word);
		if(!strlen($word)) continue;
		if($any) {
			$bindKey = $this->query->getUniqueBindKey();
			$wheres[] = "($this->tableField $likeType $bindKey)";
			$binds[$bindKey] = "%$word%";
		} else {
			$this->query->where("($this->tableField $likeType ?)", "%$word%");
		}
	}

	// test the collected conditions rather than the word list, so that an
	// empty "()" group is never added when every word ended up being skipped
	if(count($wheres)) {
		$this->query->where('(' . implode(' OR ', $wheres) . ')');
		$this->query->bindValues($binds);
	}
}
/**
* Match contains words (full, any or partial)
*
* Builds a MATCH/AGAINST condition from the indexable words in the value, plus RLIKE
* conditions for the words that getBooleanModeWords() reports as 'likeWords', and adds
* them to the query as one parenthesized WHERE group. The parts of the group are joined
* with AND when all words are required, or with OR when the operator contains "|".
* Score columns and ORDER BY statements are added only when ordering is allowed.
*
* @param string $value
* @since 3.0.160
*
*/
protected function matchWords($value) {
// ~= Contains all full words
// !~= Does not contain all full words
// ~+= Contains all full words + expand
// ~*= Contains all partial words
// ~~= Contains all words live (all full words + partial last word)
// ~|= Contains any full words
// ~|*= Contains any partial words
// ~|+= Contains any words + expand
$tableField = $this->tableField();
$operator = $this->operator;
// flags derived from the characters present in the operator
$required = strpos($operator, '|') === false;
$partial = strpos($operator, '*') !== false;
$partialLast = $operator === '~~=';
$expand = strpos($operator, '+') !== false;
$matchType = $this->matchType();
$scoreField = $this->getScoreFieldName();
$matchAgainst = '';
$wheres = array();
$data = $this->getBooleanModeWords($value, array(
'required' => $required,
'partial' => $partial,
'partialLast' => $partialLast,
'partialLess' => ($partial || $expand),
'alternates' => $expand,
));
if(empty($data['value'])) {
// currently a no-op: the forced non-match below is disabled
// query contains no indexable words: force non-match
//$this->query->where('1>2');
//return;
// TEST OUT: title|summary~|+=beer
}
if($expand) {
if(!empty($data['booleanValue']) && $this->allowOrder) {
// ensure full matches are above expanded matches: the boolean mode score
// gets a constant 111.1 added to it and is ordered by first
$preScoreField = $this->getScoreFieldName();
$bindKey = $this->query->bindValueGetKey($data['booleanValue']);
$this->query->select("$matchType($tableField) AGAINST($bindKey IN BOOLEAN MODE) + 111.1 AS $preScoreField");
$this->query->orderby("$preScoreField DESC");
}
if(!empty($data['matchValue'])) {
// the actual match condition uses the indexable words WITH QUERY EXPANSION
$bindValue = trim($data['matchValue']);
$bindKey = $this->query->bindValueGetKey($this->escapeAgainst($bindValue));
$matchAgainst = "$matchType($tableField) AGAINST($bindKey WITH QUERY EXPANSION)";
}
} else if(!empty($data['booleanValue'])) {
// non-expand operators match IN BOOLEAN MODE
$bindKey = $this->query->bindValueGetKey($data['booleanValue']);
$matchAgainst = "$matchType($tableField) AGAINST($bindKey IN BOOLEAN MODE)";
}
if($matchAgainst) {
$wheres[] = $matchAgainst;
// $this->query->where($matchAgainst);
if($this->allowOrder) {
$this->query->select("$matchAgainst AS $scoreField");
$this->query->orderby("$scoreField DESC");
}
} else if(!$this->allowStopwords) {
// no MATCH/AGAINST could be built and no stopword fallback is allowed: no match possible
// $this->query->where('1>2');
$wheres[] = '1>2';
}
if(!empty($data['likeWords'])) {
// stopwords or words that were too short to use fulltext index
$likeType = $this->not ? 'NOT RLIKE' : 'RLIKE';
$orLikes = array();
$andLikes = array();
foreach($data['likeWords'] as $word) {
$isStopword = isset($data['stopWords'][$word]);
if($isStopword && !$this->allowStopwords) continue;
if(!strlen($word)) continue;
if($partial || ($partialLast && $word === $data['lastWord'])) {
// just match partial word from beginning
$likeValue = $this->rlikeValue($word);
} else {
// match to word-end
$likeValue = $this->rlikeValue($word, array('partial' => false));
}
$bindKey = $this->query->bindValueGetKey($likeValue);
$likeWhere = "($tableField $likeType $bindKey)";
// words are optional (OR) when not required, or when a stopword in an expand query
if(!$required || ($isStopword && $expand)) {
$orLikes[] = $likeWhere;
} else {
$andLikes[] = $likeWhere;
}
}
// combine: "(or OR or)" followed by the AND'ed required conditions
$whereLike = '';
if(count($orLikes)) {
$whereLike .= '(' . implode(' OR ', $orLikes) . ')';
if(count($andLikes)) $whereLike .= $required ? ' AND ' : ' OR ';
}
if(count($andLikes)) {
$whereLike .= implode(' AND ', $andLikes);
}
if($whereLike) $wheres[] = $whereLike;
}
if(count($wheres)) {
// add everything collected above as a single WHERE group
$and = $required ? ' AND ' : ' OR ';
$this->query->where('(' . implode($and, $wheres) . ')');
}
}
/**
* Match contains entire phrase/string (*=)
*
* Uses a boolean mode MATCH/AGAINST built from the indexable words when there are any,
* and adds an RLIKE condition on the whole value whenever the fulltext index alone
* cannot express the phrase: when there are non-indexable words, when there is more
* than one word (the last word is matched partially), or when no AGAINST value could
* be built at all.
*
* @param string $value
*
*/
protected function matchPhrase($value) {
$tableField = $this->tableField();
$likeValue = '';
$useLike = false;
$words = $this->words($value);
// with more than one word, the last word is removed from $words and handled separately
$lastWord = count($words) > 1 ? array_pop($words) : '';
$badWords = array();
$goodWords = array();
// separate indexable (good) words from non-indexable (bad) words
foreach($words as $word) {
if($this->isIndexableWord($word)) {
$goodWords[$word] = $word;
} else {
$badWords[$word] = $word;
}
}
if(count($badWords)) $useLike = true;
if(!count($goodWords)) {
// 0 good words to search: do not use match/against
$againstValue = '';
} else if(count($goodWords) === 1) {
// 1 word left: non-quoted word only, partial match if no last word
$word = reset($goodWords);
$againstValue = '+' . $this->escapeAgainst($word);
if($lastWord === '') $againstValue .= '*';
} else if(!count($badWords)) {
// no bad words, okay to match all in phrase format
$againstValue = '+"' . $this->escapeAgainst(implode(' ', $words)) . '"';
} else {
// combination of good and bad words, match the good words in any order
// and let the LIKE match them as a phrase
$againstValue = $this->escapeAgainst(implode(' ', $goodWords));
$useLike = true;
}
if($useLike || $lastWord !== '' || !strlen($againstValue)) {
// match entire phrase with LIKE as secondary qualifier that includes last word
// so that we can perform a partial match on the last word only. This is necessary
// because we can't use partial match qualifiers in or out of quoted phrases.
$lastWord = strlen($lastWord) ? $this->escapeAgainst($lastWord) : '';
if(strlen($lastWord) && !$this->isStopword($lastWord)) {
// if the last word is not a stopword, let it contribute to final score:
// expand the againstValue to include the last word as a required partial match
$againstValue = trim("$againstValue +$lastWord*");
}
$likeValue = $this->rlikeValue($value);
}
if(strlen($againstValue)) {
// use MATCH/AGAINST
$bindKey = $this->query->bindValueGetKey($againstValue);
$matchType = $this->matchType();
$matchAgainst = "$matchType($tableField) AGAINST($bindKey IN BOOLEAN MODE)";
$this->query->where($matchAgainst);
if($this->allowOrder) {
$scoreField = $this->getScoreFieldName();
$this->query->select("$matchAgainst AS $scoreField");
$this->query->orderby("$scoreField DESC");
}
}
if(strlen($likeValue)) {
// LIKE is used as a secondary qualifier to MATCH/AGAINST so that it is
// performed only on rows already identified from FULLTEXT index, unless
// no MATCH/AGAINST could be created due to stopwords or too-short words
$likeType = $this->not ? 'NOT RLIKE' : 'RLIKE';
$this->query->where("($tableField $likeType ?)", $likeValue);
}
}
/**
* Match phrase with query expansion (*+=)
*
* Builds up to three MATCH/AGAINST expressions:
*
* 1. A boolean mode phrase match, scored with 333.3 added to it.
* 2. A boolean mode weighted-words match, scored with 222.2 added to it.
* 3. A match WITH QUERY EXPANSION, which is the WHERE condition of the query.
*
* Expressions 1 and 2 are only selected and ordered by when ordering is allowed.
* When the value has no indexable words, expression 1 becomes the WHERE condition
* and the method returns without adding expressions 2 and 3.
*
* @param string $value
*
*/
protected function matchPhraseExpand($value) {
$tableField = $this->tableField();
$matchType = $this->matchType();
$words = $this->words($value, array('indexable' => true));
$wordsAlternates = array();
$phraseWords = $this->words($value); // including non-indexable
// NOTE(review): array_pop() returns null if words() found no words at all, which
// would make the boolean value built below just "*" — confirm whether that can happen
$lastPhraseWord = array_pop($phraseWords);
$scoreField = $this->getScoreFieldName();
$againstValues = array();
// BOOLEAN PHRASE: full phrase matches come before expanded matches
if(count($phraseWords)) {
$phrases = array();
$phrase = array();
// group consecutive indexable words into phrases; each non-indexable word
// ends the current phrase and is added as an optional partial-match word
foreach($phraseWords as $word) {
if($this->isIndexableWord($word)) {
$phrase[] = $word;
} else {
if(count($phrase)) {
$phrases[] = $phrase;
$phrase = array();
}
$againstValues[] = $this->escapeAgainst($word) . '*';
}
}
if(count($phrase)) $phrases[] = $phrase;
if(count($phrases)) {
foreach($phrases as $phrase) {
// each phrase is required; phrases of 2+ words are quoted
$phraseStr = $this->escapeAgainst(implode(' ', $phrase));
if(count($phrase) > 1) $phraseStr = '"' . $phraseStr . '"';
$againstValues[] = "+$phraseStr";
}
}
}
// last word is always a partial match, and required only when indexable
$againstValues[] = ($this->isIndexableWord($lastPhraseWord) ? '+' : '') . $this->escapeAgainst($lastPhraseWord) . '*';
$bindKey = $this->query->bindValueGetKey(implode(' ', $againstValues));
$matchAgainst = "$matchType($tableField) AGAINST($bindKey IN BOOLEAN MODE)";
if($this->allowOrder) {
$this->query->select("$matchAgainst + 333.3 AS $scoreField");
$this->query->orderby("$scoreField DESC");
}
if(!count($words)) {
// no words to work with for query expansion (not likely, unless stopwords or too-short)
$this->query->where($matchAgainst);
return;
}
// BOOLEAN WEIGHTED WORDS: word matches above query expansion matches
$againstValue = '';
$scoreField = $this->getScoreFieldName();
foreach($words as $word) {
$wordAlternates = array();
foreach($this->getWordAlternates($word) as $w) {
if($w === $word || !$this->isIndexableWord($w)) continue;
$wordAlternates[$w] = $w; // alternates for just this word
$wordsAlternates[$w] = $w; // alternates for all words
}
$word = $this->escapeAgainst($word);
// full word match carries more weight than partial or alternate word match,
// but at least one must be there in order to have a good score
$againstValue .= "+(";
$againstValue .= ">$word $word*";
if(count($wordAlternates)) {
$againstValue .= ' ' . $this->escapeAgainst(implode(' ', $wordAlternates));
}
// also accept a partial match of the word root, when it differs from the word
$wordRoot = $this->getWordRoot($word);
if($wordRoot && $wordRoot !== $word) {
$againstValue .= ' ' . $this->escapeAgainst($wordRoot) . '*';
}
$againstValue .= ") ";
}
if($this->allowOrder && strlen($againstValue)) {
$bindKey = $this->query->bindValueGetKey(trim($againstValue));
$this->query->select("$matchType($tableField) AGAINST($bindKey IN BOOLEAN MODE) + 222.2 AS $scoreField");
$this->query->orderby("$scoreField DESC");
}
// QUERY EXPANSION: regular match/against words with query expansion
$words = array_unique(array_merge($words, $wordsAlternates));
$againstValue = $this->escapeAgainst(implode(' ', $words));
$bindKey = $this->query->bindValueGetKey($againstValue);
$matchAgainst = "$matchType($tableField) AGAINST($bindKey WITH QUERY EXPANSION)";
$this->query->where($matchAgainst);
$scoreField = $this->getScoreFieldName();
// NOTE(review): unlike the other score columns in this method, this one is selected
// even when ordering is not allowed — confirm whether that is intentional
$this->query->select("$matchAgainst AS $scoreField");
if($this->allowOrder) {
$this->query->orderby("$scoreField DESC");
}
}
/**
 * Perform a regular scored MATCH/AGAINST query (non-boolean)
 *
 * - `**=` Contains match
 * - `**+=` Contains match + expand
 *
 * For the expand operator (when ordering is allowed) a boolean mode score with 111.1
 * added to it is selected and ordered by first. The WHERE condition itself is a
 * MATCH/AGAINST of the indexable words, WITH QUERY EXPANSION for the expand operator.
 * If the value has no indexable words, a never-true condition is added instead.
 *
 * @param string $value
 *
 */
protected function matchRegular($value) {

	$column = $this->tableField();
	$useExpansion = strpos($this->operator, '+') !== false;
	$match = $this->matchType();

	if($useExpansion && $this->allowOrder) {
		// boolean mode query for sorting purposes
		$boolScore = $this->getScoreFieldName();
		$info = $this->getBooleanModeWords($value, array(
			'partialLess' => true,
			'required' => false,
			'alternates' => true,
		));
		if(!empty($info['booleanValue'])) {
			$key = $this->query->bindValueGetKey($info['booleanValue']);
			$boolMatch = $match . '(' . $column . ') AGAINST(' . $key . ' IN BOOLEAN MODE)';
			$this->query->select($boolMatch . ' + 111.1 AS ' . $boolScore);
			$this->query->orderby($boolScore . ' DESC');
		}
	}

	// standard MATCH/AGAINST with optional query expansion
	$score = $this->getScoreFieldName();
	$indexable = $this->words($value, array('indexable' => true, 'alternates' => $useExpansion));
	$against = $this->escapeAgainst(implode(' ', $indexable));

	if(!count($indexable) || !strlen(trim($against))) {
		// query contains no indexable words: force non-match
		if(strlen($value)) $this->query->where('1>2');
		return;
	}

	$key = $this->query->bindValueGetKey($against);
	$mode = $useExpansion ? 'WITH QUERY EXPANSION' : '';
	$condition = $match . '(' . $column . ') AGAINST(' . $key . ' ' . $mode . ')';
	$this->query->where($condition);

	if(!$this->allowOrder) return;

	$this->query->select($condition . ' AS ' . $score);
	$this->query->orderby($score . ' DESC');
}
/**
 * Match phrase at start or end of field value (also uses fulltext index when possible)
 *
 * Ignores whitespace, punctuation and opening/closing tags, enabling it to match
 * start/end words or phrases surrounded by non-word characters.
 *
 * - `^=` Starts with
 * - `$=` Ends with
 *
 * The match itself is always performed by an RLIKE condition. When the value has
 * indexable words, a boolean mode MATCH/AGAINST requiring those words is added
 * ahead of it as a pre-filter.
 *
 * @param string $value
 *
 */
protected function matchStartEnd($value) {

	$tableField = $this->tableField();
	$matchStart = strpos($this->operator, '^') !== false;
	$againstValue = '';
	$words = $this->words($value, array('indexable' => true));

	if(count($words)) {
		$lastWord = '';
		if($matchStart) {
			$lastWord = $this->escapeAgainst(array_pop($words));
		} else {
			array_shift($words); // skip first word since '*partial' match not possible with fulltext
		}
		// Escape each word on its own and only then add the "+" (required) prefix.
		// Escaping the already-joined "a +b +c" string would strip the "+" joiners
		// again, leaving only the first word required.
		$requiredWords = array();
		foreach($words as $word) {
			$word = trim($this->escapeAgainst($word));
			if(strlen($word)) $requiredWords[] = "+$word";
		}
		$againstValue = implode(' ', $requiredWords);
		if($matchStart) {
			$againstValue = trim("$againstValue +$lastWord*"); // 'partial*' match last word
		}
	}

	if(strlen($againstValue)) {
		// use MATCH/AGAINST to pre-filter before RLIKE when possible
		$bindKey = $this->query->bindValueGetKey($againstValue);
		$matchType = $this->matchType();
		$matchAgainst = "$matchType($tableField) AGAINST($bindKey IN BOOLEAN MODE)";
		$scoreField = $this->getScoreFieldName();
		$this->query->where($matchAgainst);
		if($this->allowOrder) {
			$this->query->select("$matchAgainst AS $scoreField");
			$this->query->orderby("$scoreField DESC");
		}
	}

	$likeType = $this->not ? 'NOT RLIKE' : 'RLIKE';

	if($matchStart) {
		// starts with phrase, [optional non-visible html or whitespace] plus query text
		$likeValue = $this->rlikeValue($value, array('start' => true));
	} else {
		// ends with phrase, [optional punctuation and non-visible HTML/whitespace]
		$likeValue = $this->rlikeValue($value, array('end' => true));
	}

	$this->query->where("($tableField $likeType ?)", $likeValue);
}
/**
 * Match text using boolean mode commands (Advanced search)
 *
 * The text is converted by getBooleanModeCommands() and matched IN BOOLEAN MODE.
 * A score column and ORDER BY are added only when ordering is allowed.
 *
 * @param string $text
 * @since 3.0.160
 *
 */
protected function matchCommands($text) {

	$column = $this->tableField();
	$score = $this->getScoreFieldName();
	$commands = $this->getBooleanModeCommands($text);
	$key = $this->query->bindValueGetKey($commands);
	$condition = $this->matchType() . '(' . $column . ') AGAINST(' . $key . ' IN BOOLEAN MODE) ';

	$this->query->where($condition);

	if(!$this->allowOrder) return;

	$this->query->select($condition . ' AS ' . $score . ' ');
	$this->query->orderby($score . ' DESC');
}
/**
 * Get verbose data array of words identified and prepared for boolean mode
 *
 * @param string $value
 * @param array $options
 *  - `required` (bool): Are given words required in the query? (default=true)
 *  - `partial` (bool): Is it okay to match a partial value? i.e. can "will" match "willy" (default=false)
 *  - `partialLast` (bool): Use partial only for last word? (default=true for the `~~=` and `^=` operators, false otherwise)
 *  - `partialLess` (bool): Weight partial match words less than full word match? (default=false)
 *  - `useStopwords` (bool): Allow inclusion of stopwords? (default=null, meaning only when `partial` is true)
 *  - `useShortwords` (bool): Allow inclusion of too-short words? (default=null, meaning only when `partial` is true)
 *  - `alternates` (bool): Get word alternates? (default=true when the operator contains "+", false otherwise)
 * @return array Verbose array with the following keys:
 *  - `value` (string): All words, space separated
 *  - `originalValue` (string): Value as it was given to this method
 *  - `matchValue` (string): Indexable words and alternate words only, space separated
 *  - `booleanValue` (string): Words with boolean mode operators added, space separated
 *  - `booleanWords` (array): The parts of booleanValue, indexed by word
 *  - `likeWords` (array): Words for the caller to match with LIKE/RLIKE rather than the fulltext index
 *  - `allWords` (array): All words, including any alternates that were added
 *  - `goodWords` (array): Words that are neither stopwords nor too short
 *  - `badWords` (array): Stopwords and too-short words
 *  - `stopWords` (array): Stopwords
 *  - `shortWords` (array): Words shorter than the minimum word length
 *  - `altWords` (array): Alternate words that were added
 *  - `joinWords` (array): Words from the original value containing a joiner such as "-" or "."
 *  - `lastWord` (string|null): Last word when `partialLast` is in effect, otherwise null
 *  - `minWordLength` (int): Value of the ft_min_word_len database variable
 *
 */
protected function getBooleanModeWords($value, array $options = array()) {

	$expand = strpos($this->operator, '+') !== false;

	$defaults = array(
		'required' => true,
		'partial' => false,
		'partialLast' => ($this->operator === '~~=' || $this->operator === '^='),
		'partialLess' => false,
		'useStopwords' => null,
		'useShortwords' => null,
		'alternates' => $expand,
	);

	$options = array_merge($defaults, $options);
	$minWordLength = (int) $this->database->getVariable('ft_min_word_len');
	$originalValue = $value;
	$value = $this->escapeAgainst($value);
	$booleanValues = array();
	$partial = $options['partial'] ? '*' : '';
	$required = $options['required'] ? '+' : '';
	$useStopwords = is_bool($options['useStopwords']) ? $options['useStopwords'] : $partial === '*';
	$useShortwords = is_bool($options['useShortwords']) ? $options['useShortwords'] : $partial === '*';
	$lastWord = null;
	$goodWords = array();
	$stopWords = array();
	$shortWords = array();
	$likeWords = array();
	$altWords = array();
	$joinWords = array();
	$joiners = array('->', '-', '.', ':');

	// get all words
	$allWords = $this->words($value);

	// words from the original (unescaped) value that contain a joiner after their
	// first character are also matched with LIKE
	foreach(explode(' ', $originalValue) as $word) {
		foreach($joiners as $joiner) {
			if(strpos($word, $joiner)) {
				$joinWords[$word] = $word;
				$likeWords[$word] = $word;
				break;
			}
		}
	}

	if($options['partialLast']) {
		// treat last word separately (partial last word for live or starts-with searches)
		// only last word is partial
		$lastWord = end($allWords);
		$partial = '';
	}

	// iterate through all words to build boolean query values
	foreach($allWords as $word) {
		$length = strlen($word);
		if(!$length || isset($booleanValues[$word])) continue;

		if($this->isStopword($word)) {
			// handle stop-word
			$stopWords[$word] = $word;
			if($useStopwords && $partial) $booleanValues[$word] = "<$word*";
			if($required) $likeWords[$word] = $word;
			continue; // do nothing further with stopwords

		} else if($length < $minWordLength) {
			// handle too-short word
			$shortWords[$word] = $word;
			if($useShortwords && $partial) $booleanValues[$word] = "$word*";
			if($required) $likeWords[$word] = $word;
			continue; // do nothing further with short words

		} else if($options['partialLess']) {
			// handle regular word and match full word (more weight), or partial word (less weight)
			$booleanValues[$word] = $required ? "+(>$word $word*)" : "$word*";
			$goodWords[$word] = $word;

		} else {
			// handle regular word
			$booleanValues[$word] = $required . $word . $partial;
			$goodWords[$word] = $word;
		}

		if($options['alternates']) {
			$booleanValue = $booleanValues[$word];
			$alternates = $this->getBooleanModeAlternateWords($word, $booleanValue, $minWordLength, $options);
			if($booleanValue !== $booleanValues[$word]) {
				$booleanValues[$word] = $booleanValue;
				$altWords = array_merge($altWords, $alternates);
				// add only this word's alternates: merging the cumulative $altWords here
				// would add the alternates of earlier words to $allWords again each time
				$allWords = array_merge($allWords, $alternates);
			}
		}
	}

	if(strlen("$lastWord")) {
		// only last word allowed to be a partial match word
		$lastRequired = isset($stopWords[$lastWord]) || isset($shortWords[$lastWord]) ? '' : $required;
		$booleanValues[$lastWord] = $lastRequired . $lastWord . '*';
	}

	if($useStopwords && !$required && count($stopWords) && count($goodWords)) {
		// increase weight of non-stopwords
		foreach($goodWords as $word) {
			$booleanWord = $booleanValues[$word];
			if(!in_array($booleanWord[0], array('(', '+', '<', '>', '-', '~', '"'))) {
				$booleanValues[$word] = ">$booleanWord";
			}
		}
	}

	$badWords = array_merge($stopWords, $shortWords);

	if(count($stopWords)) {
		$numOkayWords = count($goodWords) + count($shortWords);
		foreach($stopWords as $word) {
			$likeWords[$word] = $word;
			if($numOkayWords && isset($booleanValues[$word])) {
				// make word non-required in boolean query
				$booleanValues[$word] = ltrim($booleanValues[$word], '+');
			} else {
				// boolean query requires at least one good word to work,
				// so if there aren't any, remove this word from boolean query
				unset($booleanValues[$word]);
			}
		}
	}

	return array(
		'value' => trim(implode(' ', $allWords)),
		'originalValue' => $originalValue,
		'matchValue' => trim(implode(' ', $goodWords) . ' ' . implode(' ', $altWords)), // indexable words only
		'booleanValue' => trim(implode(' ', $booleanValues)),
		'booleanWords' => $booleanValues,
		'likeWords' => $likeWords,
		'allWords' => $allWords,
		'goodWords' => $goodWords,
		'badWords' => $badWords,
		'stopWords' => $stopWords,
		'shortWords' => $shortWords,
		'altWords' => $altWords,
		'joinWords' => $joinWords,
		'lastWord' => $lastWord,
		'minWordLength' => $minWordLength,
	);
}
/**
* Helper for getBooleanModeWords to handle population of alternate words in boolean value
*
* Collects the alternates of the given word plus its root word, and rewrites the given
* boolean value (by reference) into a parenthesized group holding the original value
* followed by those alternates. The boolean value is left untouched when no usable
* alternates were found.
*
* @param string $word Word to find alternates for
* @param string &$booleanValue Existing boolean value which will be updated
* @param int $minWordLength
* @param array $options Uses the `partialLess` option only
* @return array Alternate words that are indexable, escaped for AGAINST (root word not included)
* @since 3.0.162
*
*/
protected function getBooleanModeAlternateWords($word, &$booleanValue, $minWordLength, array $options) {
// remember whether the existing boolean value was a required ("+") one
$required = strpos($booleanValue, '+') === 0 ? '+' : '';
$alternateWords = $this->getWordAlternates($word);
$rootWord = $this->getWordRoot($word);
if($rootWord) {
if(!in_array($rootWord, $alternateWords)) {
$alternateWords[] = $rootWord;
} else {
// root word is already one of the alternates, so it gets no special handling below
$rootWord = '';
}
}
$alternateWords = array_unique($alternateWords);
$booleanWords = $alternateWords;
// prepare alternate words for inclusion in boolean value and remove any that aren't indexable
foreach($alternateWords as $key => $alternateWord) {
$alternateWord = $this->escapeAgainst($alternateWord);
$length = $this->strlen($alternateWord);
if($alternateWord === $rootWord && $length > 1) {
// root word is always partial match. weight less if there are other alternates to match
$less = count($booleanWords) > 1 && !empty($options['partialLess']) ? '<' : '';
$booleanWords[$key] = $less . $alternateWord . '*';
// a long enough root word is also added as a full (non-partial) word
if($length >= $minWordLength && $length >= 3) $booleanWords[] = $less . $alternateWord;
// root word is used in the boolean value only, not returned as an alternate word
unset($alternateWords[$key]);
} else if($length < $minWordLength || $this->isStopword($alternateWord)) {
// alternate word not indexable, remove it
unset($alternateWords[$key]);
unset($booleanWords[$key]);
} else {
// replace with escaped version
$alternateWords[$key] = $alternateWord;
$booleanWords[$key] = $alternateWord;
}
}
// nothing usable found: leave $booleanValue as it was
if(!count($booleanWords)) return array();
// rebuild boolean value to include alternates: "+(word word)" or "+word" or ""
if($required) $booleanValue = ltrim($booleanValue, '+');
// remove parens from boolean value, if present
$booleanValue = trim($booleanValue, '()');
// assign higher weight to existing first word, if not already
if($booleanValue && strpos($booleanValue, '>') !== 0) $booleanValue = ">$booleanValue";
// append alternate words
$booleanValue = trim($booleanValue . ' ' . implode(' ', $booleanWords));
// package boolean value into parens and optional "+" prefix (indicating required)
$booleanValue = "$required($booleanValue)";
return $alternateWords;
}
/**
 * Build a boolean-mode value from a query that may contain "+", "-", "*" and '"' commands
 *
 * Curly double quotes are first converted to straight ones. The value is then split into
 * commands by the selector registered for the `#=` operator, and each command is escaped
 * and re-assembled: quoted when it is a phrase, prefixed with its type when it has one,
 * and suffixed with "*" when it is a partial match.
 *
 * @param string $value
 * @return string Space-separated boolean-mode terms
 *
 */
protected function getBooleanModeCommands($value) {

	$value = str_replace(array('“', '”'), '"', $value);

	/** @var SelectorContainsAdvanced $selector */
	$selector = Selectors::getSelectorByOperator('#=');
	$terms = array();

	foreach($selector->valueToCommands($value) as $command) {
		$term = $this->escapeAgainst($command['value']);
		if($command['phrase']) $term = "\"$term\"";
		if($command['type']) $term = $command['type'] . $term;
		if($command['partial']) $term = $term . '*';
		$terms[] = $term;
	}

	return implode(' ', $terms);
}
/**
 * Get array of words from given value
 *
 * Words are extracted by the sanitizer's wordsArray() method, which receives the merged
 * options. Depending on options, alternate word forms are then appended and words that
 * are not indexable (or that are stopwords) are removed.
 *
 * Removed words are unset rather than re-indexed, so the returned array keys are
 * not guaranteed to be contiguous.
 *
 * @param string $value
 * @param array $options
 *  - `keepNumberFormat` (bool): Passed through to wordsArray() (default=false)
 *  - `keepApostrophe` (bool): Passed through to wordsArray() (default=false)
 *  - `minWordLength` (int|bool): Minimum allowed length, or true to use ft_min_word_len (default=1)
 *  - `stopwords` (bool): Allow stopwords? Only consulted when `indexable` is false (default=true)
 *  - `indexable` (bool): Include only indexable words? (default=false)
 *  - `alternates` (bool): Include alternate versions of words? (default=false)
 * @return array
 *
 */
protected function words($value, array $options = array()) {

	$defaults = array(
		'keepNumberFormat' => false,
		'keepApostrophe' => false,
		'minWordLength' => 1, // minimum allowed length or true for ft_min_word_len
		'stopwords' => true, // allow stopwords
		'indexable' => false, // include only indexable words?
		'alternates' => false, // include alternate versions of words?
	);

	$options = array_merge($defaults, $options);

	// use the cached helper rather than querying the variable again
	if($options['minWordLength'] === true) $options['minWordLength'] = $this->getMinWordLength();

	$words = $this->wire()->sanitizer->wordsArray($value, $options);

	if($options['alternates']) {
		// iterates the original word list only; appended alternates are not expanded again
		foreach($words as $word) {
			foreach($this->getWordAlternates($word) as $alt) {
				// strict comparison: loose comparison treats numeric-looking words such
				// as "10" and "1e1" as equal and would wrongly skip the alternate
				if(!in_array($alt, $words, true)) $words[] = $alt;
			}
		}
	}

	if($options['indexable']) {
		foreach($words as $key => $word) {
			if(!$this->isIndexableWord($word)) unset($words[$key]);
		}
	} else if(!$options['stopwords']) {
		foreach($words as $key => $word) {
			if($this->isStopword($word)) unset($words[$key]);
		}
	}

	return $words;
}
/**
 * Prepare a word or phrase for use in an RLIKE statement
 *
 * The value is regex-escaped, hyphens and runs of spaces are made interchangeable,
 * and straight/curly apostrophes are made interchangeable. The result is then anchored
 * according to the options:
 *
 * - `start`: must match at the beginning, optionally preceded by whitespace and markup tags.
 * - `end`: must match at the end, optionally followed by punctuation, whitespace and markup
 *   tags. Only consulted when `start` is false.
 * - neither: must match at the beginning of a word. When `partial` is false it must also
 *   end at a word boundary.
 *
 * @param string $value
 * @param array $options
 *  - `start` (bool): Match at beginning of text? (default=false)
 *  - `end` (bool): Match at end of text? (default=false)
 *  - `partial` (bool): Allow value to match just the beginning of a longer word? (default=true)
 * @return string
 *
 */
protected function rlikeValue($value, array $options = array()) {

	$defaults = array(
		'start' => false,
		'end' => false,
		'partial' => true, // no trailing word boundary required
	);

	$options = array_merge($defaults, $options);

	// single typographic quotes written as UTF-8 byte escapes so that the patterns
	// below do not depend on how this source file is stored or transferred
	$apos = "\xE2\x80\x99"; // U+2019 right single quote / curly apostrophe
	$lsquo = "\xE2\x80\x98"; // U+2018 left single quote
	$lsaquo = "\xE2\x80\xB9"; // U+2039 single left angle quote
	$rsaquo = "\xE2\x80\xBA"; // U+203A single right angle quote

	// consider hyphen and space the same for matching purposes (must be before preg_quote)
	$value = str_replace('-', ' ', $value);

	// escape characters used in regular expressions
	$likeValue = preg_quote($value);

	if(strpos($likeValue, "'") !== false || strpos($likeValue, $apos) !== false) {
		// match either straight or curly apostrophe. An alternation is used rather than
		// a character class because without the "u" modifier a class would match the
		// individual bytes of the curly apostrophe and could split other UTF-8 characters
		$aposGroup = "('|" . $apos . ")";
		$likeValue = preg_replace("/(?:'|" . $apos . ")+/", $aposGroup, $likeValue);
		// if word ends with apostrophe then apostrophe is optional
		$likeValue = rtrim(str_replace($aposGroup . ' ', $aposGroup . '? ', "$likeValue "));
	}

	if(strpos($likeValue, ' ') !== false) {
		// collapse runs of spaces to just one
		$likeValue = preg_replace('/ {2,}/', ' ', $likeValue);
		// hyphen/space can match space or hyphen in any quantity
		$likeValue = str_replace(' ', '[- ]+', $likeValue);
	}

	if($options['start']) {
		// given value must match at beginning
		$likeValue = '^[[:space:]]*(<[^>]+>)*[[:space:]]*' . $likeValue;

	} else if($options['end']) {
		// given value must match at end
		$likeValue .= '[[:space:]]*[[:punct:]]*[[:space:]]*(<[^>]+>)*[[:space:]]*$';

	} else {
		// given value can match at beginning of any word boundary in value
		if($this->wire()->database->getRegexEngine() === 'ICU') {
			list($a, $b) = array("\\b", "\\b");
		} else {
			list($a, $b) = array('[[:<:]]', '[[:>:]]');
		}

		// every alternative is non-empty so that the group cannot match nothing,
		// which would defeat the boundary requirement
		$opening = array('>', $lsquo, '“', '„', '«', $lsaquo, '¿', '¡');
		$likeValue = "($a|[[:blank:]]|[[:punct:]]|[[:space:]]|^|[-]|" . implode('|', $opening) . ')' . $likeValue;

		// if not doing partial matching then must also end at word boundary
		if(!$options['partial']) {
			$closing = array('<', $apos, '”', '»', $rsaquo);
			$likeValue .= "($b|[[:blank:]]|[[:punct:]]|[[:space:]]|$|[-]|" . implode('|', $closing) . ')';
		}
	}

	return $likeValue;
}
/**
 * Get length of given string, using mb_strlen() when that function is available
 *
 * Falls back to the byte-based strlen() when the mbstring function does not exist.
 *
 * @param string $value
 * @return int
 *
 */
protected function strlen($value) {
	return function_exists('mb_strlen') ? mb_strlen($value) : strlen($value);
}
/**
 * Is given word a stopword?
 *
 * Any word shorter than 2 characters is treated as a stopword without asking the
 * database. Otherwise the decision is delegated to the database's isStopword() method.
 *
 * @param string $word
 * @return bool
 *
 */
protected function isStopword($word) {
	$tooShort = $this->strlen($word) < 2;
	return $tooShort ? true : $this->wire()->database->isStopword($word);
}
/**
 * Is word too short for fulltext index?
 *
 * Compares the word length against getMinWordLength(). When that minimum is 0
 * no word is considered short.
 *
 * @param string $word
 * @return bool
 *
 */
protected function isShortword($word) {
	$min = $this->getMinWordLength();
	return $min && $this->strlen($word) < $min;
}
/**
 * Is given word not a stopword and long enough to be indexed?
 *
 * The length check runs first, so the stopword check is skipped for short words.
 *
 * @param string $word
 * @return bool
 *
 */
protected function isIndexableWord($word) {
	return !$this->isShortword($word) && !$this->isStopword($word);
}
/**
 * Get unique score field name
 *
 * The name is "_score_" followed by the table name, an underscore, the field name and
 * a counter. The counter is kept per table/field pair in the static $scoreCnts array,
 * starting at 0 and increasing by 1 on each further call for the same pair.
 *
 * @return string
 * @since 3.0.160
 *
 */
protected function getScoreFieldName() {
	$key = "{$this->tableName}_{$this->fieldName}";
	if(isset(self::$scoreCnts[$key])) {
		self::$scoreCnts[$key] += 1;
	} else {
		self::$scoreCnts[$key] = 0;
	}
	return "_score_$key" . self::$scoreCnts[$key];
}
/**
 * Get minimum allowed indexable word length
 *
 * Reads the `ft_min_word_len` variable from the database on first use and caches
 * the integer result in $this->minWordLength for later calls.
 *
 * @return int
 *
 */
protected function getMinWordLength() {
	if($this->minWordLength === null) {
		// note: ft_min_word_len is automatically changed to InnoDB's equivalent when applicable
		$this->minWordLength = (int) $this->database->getVariable('ft_min_word_len');
	}
	return $this->minWordLength;
}
/**
 * Get other variations of given word to search (such as plural, singular, lemmas, etc.)
 *
 * Delegates to the text tools' getWordAlternates() method, passing along the current
 * operator, requesting lowercase results and applying a minimum length.
 *
 * @param string $word
 * @param int|null $minLength Minimum length for returned words, or null to use getMinWordLength()
 * @return array
 *
 */
protected function getWordAlternates($word, $minLength = null) {
	if($minLength === null) $minLength = $this->getMinWordLength();
	$settings = array(
		'operator' => $this->operator,
		'lowercase' => true,
		'minLength' => $minLength,
	);
	$textTools = $this->wire()->sanitizer->getTextTools();
	return $textTools->getWordAlternates($word, $settings);
}
/**
 * Get root of word (currently not implemented)
 *
 * This is a stub: it always returns an empty string, whatever word is given.
 *
 * @param string $word
 * @return string Always blank
 *
 */
protected function getWordRoot($word) {
	$root = '';
	if($word) {
		// intentionally empty: no root lookup is performed for the given word
	}
	return $root;
}
/**
 * Call forceLike(true) to force use of LIKE, or omit argument to get current setting
 *
 * This forces LIKE only for matching operators that have a LIKE equivalent.
 * This includes these operators: `*=`, `^=`, `$=`, `~=`, `~|=`.
 *
 * Any argument that is not a boolean leaves the current setting unchanged.
 *
 * @param bool|null $forceLike True or false to change the setting, or omit to only read it
 * @return bool Setting in effect after the call
 * @since 3.0.182
 *
 */
public function forceLike($forceLike = null) {
	if($forceLike === true || $forceLike === false) {
		$this->forceLike = $forceLike;
	}
	return $this->forceLike;
}
}