Skip to content

Commit 58de38b

Browse files
authored
Fix parentheses for complex OR boolean search expressions (FreshRSS#6672)
* Fix OR parentheses * Pass all tests * Forgotten comment * Minor whitespace * Fix several cases involving negation * Line length * Fix `OR NOT`
1 parent 666e7b2 commit 58de38b

File tree

4 files changed

+245
-20
lines changed

4 files changed

+245
-20
lines changed

app/Models/BooleanSearch.php

Lines changed: 118 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ class FreshRSS_BooleanSearch {
1111
private array $searches = [];
1212

1313
/**
14-
* @phpstan-var 'AND'|'OR'|'AND NOT'
14+
* @phpstan-var 'AND'|'OR'|'AND NOT'|'OR NOT'
1515
*/
1616
private string $operator;
1717

18-
/** @param 'AND'|'OR'|'AND NOT' $operator */
18+
/** @param 'AND'|'OR'|'AND NOT'|'OR NOT' $operator */
1919
public function __construct(string $input, int $level = 0, string $operator = 'AND', bool $allowUserQueries = true) {
2020
$this->operator = $operator;
2121
$input = trim($input);
@@ -38,6 +38,8 @@ public function __construct(string $input, int $level = 0, string $operator = 'A
3838
}
3939
$this->raw_input = $input;
4040

41+
$input = self::consistentOrParentheses($input);
42+
4143
// Either parse everything as a series of BooleanSearch’s combined by implicit AND
4244
// or parse everything as a series of Search’s combined by explicit OR
4345
$this->parseParentheses($input, $level) || $this->parseOrSegments($input);
@@ -130,6 +132,107 @@ private function parseUserQueryIds(string $input, bool $allowUserQueries = true)
130132
return $input;
131133
}
132134

135+
/**
136+
* Example: 'ab cd OR ef OR "gh ij"' becomes '(ab cd) OR (ef) OR ("gh ij")'
137+
*/
138+
public static function addOrParentheses(string $input): string {
	$input = trim($input);
	if ($input === '') {
		return '';
	}

	// Split on the OR keyword (any case) and keep the keyword itself as a separate piece.
	$pieces = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
	if (count($pieces) <= 1) {
		// No OR at all: nothing to wrap.
		return $input;
	}

	$output = '';
	$pending = '';
	foreach ($pieces as $piece) {
		$pending .= $piece;

		if (trim($pending) === '') {
			// Only whitespace so far: drop it.
			$pending = '';
			continue;
		}

		if (strcasecmp($pending, 'OR') === 0) {
			// A bare OR operator is copied as written, followed by a single space.
			$output .= $pending . ' ';
			$pending = '';
			continue;
		}

		$nbQuotes = substr_count($pending, '"') + substr_count($pending, '&quot;');
		if ($nbQuotes % 2 !== 0) {
			// Odd number of quotes: the OR we split on sits inside a quoted string, keep accumulating.
			continue;
		}

		$pending = trim($pending);
		// A lone negation sign is emitted bare (no parentheses, no trailing space)
		// so that it stays attached to whatever follows.
		$output .= in_array($pending, ['!', '-'], true) ? $pending : ('(' . $pending . ') ');
		$pending = '';
	}

	// Flush whatever is left, e.g. a segment whose quotes were never closed.
	$pending = trim($pending);
	if ($pending !== '') {
		$output .= in_array($pending, ['!', '-'], true) ? $pending : ('(' . $pending . ')');
	}

	return trim($output);
}
178+
179+
/**
180+
* If the query contains a mix of `OR` expressions with and without parentheses,
181+
* then add parentheses to make the query consistent.
182+
* Example: '(ab (cd OR ef)) OR gh OR ij OR (kl)' becomes '(ab (cd OR ef)) OR (gh) OR (ij) OR (kl)'
183+
*/
184+
public static function consistentOrParentheses(string $input): string {
	if (preg_match('/(?<!\\\\)\\(/', $input) !== 1) {
		// No unescaped opening parenthesis: the input is returned as is (trimmed)
		return trim($input);
	}

	$depth = 0;		// Current nesting level of unescaped parentheses
	$output = '';
	$pending = '';	// Text collected since the last top-level boundary

	for ($pos = 0, $end = strlen($input); $pos < $end; $pos++) {
		$char = $input[$pos];
		$isEscaped = $pos > 0 && $input[$pos - 1] === '\\';

		if ($isEscaped || ($char !== '(' && $char !== ')')) {
			// Ordinary character, or a parenthesis preceded by a backslash
			$pending .= $char;
			continue;
		}

		if ($char === '(') {
			$depth++;
			if ($depth !== 1) {
				// Nested parenthesis: kept verbatim, handled later by the recursive call
				$pending .= $char;
				continue;
			}
			// Top-level opening: flush the plain text seen before it
			if ($pending !== '') {
				$output = rtrim($output) . ' ' . self::addOrParentheses($pending);
				if (!preg_match('/[!-]$/', $output)) {
					// No trailing negation sign: separate from the upcoming group with a space
					$output .= ' ';
				}
				$pending = '';
			}
			continue;
		}

		// Here $char is an unescaped ')'
		$depth--;
		if ($depth !== 0) {
			$pending .= $char;
			continue;
		}
		// Top-level closing: normalise the group content recursively and re-wrap it
		$inner = self::consistentOrParentheses($pending);
		if ($inner !== '') {
			$output .= '(' . $inner . ')';
		}
		$pending = '';
	}

	// Text remaining after the last top-level group (or never-closed content)
	if (trim($pending) !== '') {
		$output = rtrim($output);
		if (!preg_match('/[!-]$/', $pending)) {
			$output .= ' ';
		}
		$output .= self::addOrParentheses($pending);
	}

	return trim($output);
}
235+
133236
/** @return bool True if some parenthesis logic took over, false otherwise */
134237
private function parseParentheses(string $input, int $level): bool {
135238
$input = trim($input);
@@ -146,9 +249,14 @@ private function parseParentheses(string $input, int $level): bool {
146249
$hasParenthesis = true;
147250

148251
$before = trim($before);
149-
if (preg_match('/[!-]$/i', $before)) {
252+
if (preg_match('/[!-]$/', $before)) {
150253
// Trim trailing negation
151-
$before = substr($before, 0, -1);
254+
$before = rtrim($before, ' !-');
255+
$isOr = preg_match('/\bOR$/i', $before);
256+
if ($isOr) {
257+
// Trim trailing OR
258+
$before = substr($before, 0, -2);
259+
}
152260

153261
// The text prior to the negation is a BooleanSearch
154262
$searchBefore = new FreshRSS_BooleanSearch($before, $level + 1, $nextOperator);
@@ -157,8 +265,8 @@ private function parseParentheses(string $input, int $level): bool {
157265
}
158266
$before = '';
159267

160-
// The next BooleanSearch will have to be combined with AND NOT instead of default AND
161-
$nextOperator = 'AND NOT';
268+
// The next BooleanSearch will have to be combined with AND NOT or OR NOT instead of default AND
269+
$nextOperator = $isOr ? 'OR NOT' : 'AND NOT';
162270
} elseif (preg_match('/\bOR$/i', $before)) {
163271
// Trim trailing OR
164272
$before = substr($before, 0, -2);
@@ -212,7 +320,7 @@ private function parseParentheses(string $input, int $level): bool {
212320
$i++;
213321
}
214322
// $sub = trim($sub);
215-
// if ($sub != '') {
323+
// if ($sub !== '') {
216324
// // TODO: Consider throwing an error or warning in case of non-matching parenthesis
217325
// }
218326
// } elseif ($c === ')') {
@@ -249,12 +357,11 @@ private function parseOrSegments(string $input): void {
249357
return;
250358
}
251359
$splits = preg_split('/\b(OR)\b/i', $input, -1, PREG_SPLIT_DELIM_CAPTURE) ?: [];
252-
253360
$segment = '';
254361
$ns = count($splits);
255362
for ($i = 0; $i < $ns; $i++) {
256363
$segment = $segment . $splits[$i];
257-
if (trim($segment) == '' || strcasecmp($segment, 'OR') === 0) {
364+
if (trim($segment) === '' || strcasecmp($segment, 'OR') === 0) {
258365
$segment = '';
259366
} else {
260367
$quotes = substr_count($segment, '"') + substr_count($segment, '&quot;');
@@ -266,7 +373,7 @@ private function parseOrSegments(string $input): void {
266373
}
267374
}
268375
$segment = trim($segment);
269-
if ($segment != '') {
376+
if ($segment !== '') {
270377
$this->searches[] = new FreshRSS_Search($segment);
271378
}
272379
}
@@ -280,7 +387,7 @@ public function searches(): array {
280387
return $this->searches;
281388
}
282389

283-
/** @return 'AND'|'OR'|'AND NOT' depending on how this BooleanSearch should be combined */
390+
/** @return 'AND'|'OR'|'AND NOT'|'OR NOT' depending on how this BooleanSearch should be combined */
284391
public function operator(): string {
285392
return $this->operator;
286393
}

app/Models/Entry.php

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -577,12 +577,20 @@ public function matches(FreshRSS_BooleanSearch $booleanSearch): bool {
577577
foreach ($booleanSearch->searches() as $filter) {
578578
if ($filter instanceof FreshRSS_BooleanSearch) {
579579
// BooleanSearches are combined by AND (default) or OR or AND NOT (special cases) operators and are recursive
580-
if ($filter->operator() === 'OR') {
581-
$ok |= $this->matches($filter);
582-
} elseif ($filter->operator() === 'AND NOT') {
583-
$ok &= !$this->matches($filter);
584-
} else { // AND
585-
$ok &= $this->matches($filter);
580+
switch ($filter->operator()) {
581+
case 'OR':
582+
$ok |= $this->matches($filter);
583+
break;
584+
case 'OR NOT':
585+
$ok |= !$this->matches($filter);
586+
break;
587+
case 'AND NOT':
588+
$ok &= !$this->matches($filter);
589+
break;
590+
case 'AND':
591+
default:
592+
$ok &= $this->matches($filter);
593+
break;
586594
}
587595
} elseif ($filter instanceof FreshRSS_Search) {
588596
// Searches are combined by OR and are not recursive

app/Models/EntryDAO.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -762,7 +762,7 @@ public static function sqlBooleanSearch(string $alias, FreshRSS_BooleanSearch $f
762762
if ($filterSearch !== '') {
763763
if ($search !== '') {
764764
$search .= $filter->operator();
765-
} elseif ($filter->operator() === 'AND NOT') {
765+
} elseif (in_array($filter->operator(), ['AND NOT', 'OR NOT'], true)) {
766766
// Special case if we start with a negation (there is already the default AND before)
767767
$search .= ' NOT';
768768
}

tests/app/Models/SearchTest.php

Lines changed: 112 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,13 +284,60 @@ public function provideMultipleSearch(): array {
284284
);
285285
}
286286

287+
/**
288+
* @dataProvider provideAddOrParentheses
289+
*/
290+
public function test__addOrParentheses(string $input, string $output): void {
	// Strict comparison: the rewritten query must be exactly the expected string,
	// without the loose (numeric-string) equality that assertEquals allows.
	self::assertSame($output, FreshRSS_BooleanSearch::addOrParentheses($input));
}
293+
294+
/** @return array<array{string,string}> */
295+
public function provideAddOrParentheses(): array {
296+
return [
297+
['ab', 'ab'],
298+
['ab cd', 'ab cd'],
299+
['!ab -cd', '!ab -cd'],
300+
['ab OR cd', '(ab) OR (cd)'],
301+
['!ab OR -cd', '(!ab) OR (-cd)'],
302+
['ab cd OR ef OR "gh ij"', '(ab cd) OR (ef) OR ("gh ij")'],
303+
['ab (!cd)', 'ab (!cd)'],
304+
];
305+
}
306+
307+
/**
308+
* @dataProvider provideconsistentOrParentheses
309+
*/
310+
public function test__consistentOrParentheses(string $input, string $output): void {
	// Strict comparison: the normalised query must be exactly the expected string,
	// without the loose (numeric-string) equality that assertEquals allows.
	self::assertSame($output, FreshRSS_BooleanSearch::consistentOrParentheses($input));
}
313+
314+
/** @return array<array{string,string}> */
315+
public function provideconsistentOrParentheses(): array {
316+
return [
317+
['ab cd ef', 'ab cd ef'],
318+
['(ab cd ef)', '(ab cd ef)'],
319+
['("ab cd" ef)', '("ab cd" ef)'],
320+
['"ab cd" (ef gh) "ij kl"', '"ab cd" (ef gh) "ij kl"'],
321+
['ab (!cd)', 'ab (!cd)'],
322+
['ab !(cd)', 'ab !(cd)'],
323+
['(ab) -(cd)', '(ab) -(cd)'],
324+
['ab cd OR ef OR "gh ij"', 'ab cd OR ef OR "gh ij"'],
325+
['"plain or text" OR (cd)', '("plain or text") OR (cd)'],
326+
['(ab) OR cd OR ef OR (gh)', '(ab) OR (cd) OR (ef) OR (gh)'],
327+
['(ab (cd OR ef)) OR gh OR ij OR (kl)', '(ab (cd OR ef)) OR (gh) OR (ij) OR (kl)'],
328+
['(ab (cd OR ef OR (gh))) OR ij', '(ab ((cd) OR (ef) OR (gh))) OR (ij)'],
329+
['(ab (!cd OR ef OR (gh))) OR ij', '(ab ((!cd) OR (ef) OR (gh))) OR (ij)'],
330+
['(ab !(cd OR ef OR !(gh))) OR ij', '(ab !((cd) OR (ef) OR !(gh))) OR (ij)'],
331+
];
332+
}
333+
287334
/**
288335
* @dataProvider provideParentheses
289336
* @param array<string> $values
290337
*/
291-
public function test__construct_parentheses(string $input, string $sql, array $values): void {
338+
public function test__parentheses(string $input, string $sql, array $values): void {
292339
[$filterValues, $filterSearch] = FreshRSS_EntryDAOPGSQL::sqlBooleanSearch('e.', new FreshRSS_BooleanSearch($input));
293-
self::assertEquals($sql, $filterSearch);
340+
self::assertEquals(trim($sql), trim($filterSearch));
294341
self::assertEquals($values, $filterValues);
295342
}
296343

@@ -337,6 +384,69 @@ public function provideParentheses(): array {
337384
'(e.title LIKE ? )',
338385
['%"hello world"%'],
339386
],
387+
[
388+
'(ab) OR (cd) OR (ef)',
389+
'(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))',
390+
['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
391+
],
392+
[
393+
'("plain or text") OR (cd)',
394+
'(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))',
395+
['%plain or text%', '%plain or text%', '%cd%', '%cd%'],
396+
],
397+
[
398+
'"plain or text" OR cd',
399+
'((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) )',
400+
['%plain or text%', '%plain or text%', '%cd%', '%cd%'],
401+
],
402+
[
403+
'"plain OR text" OR cd',
404+
'((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ) ',
405+
['%plain OR text%', '%plain OR text%', '%cd%', '%cd%'],
406+
],
407+
[
408+
'ab OR cd OR (ef)',
409+
'(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) ',
410+
['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
411+
],
412+
[
413+
'ab OR cd OR ef',
414+
'((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) )',
415+
['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
416+
],
417+
[
418+
'(ab) cd OR ef OR (gh)',
419+
'(((e.title LIKE ? OR e.content LIKE ?) )) AND (((e.title LIKE ? OR e.content LIKE ?) )) ' .
420+
'OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))',
421+
['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%', '%gh%', '%gh%'],
422+
],
423+
[
424+
'(ab) OR cd OR ef OR (gh)',
425+
'(((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) )) ' .
426+
'OR (((e.title LIKE ? OR e.content LIKE ?) )) OR (((e.title LIKE ? OR e.content LIKE ?) ))',
427+
['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%', '%gh%', '%gh%'],
428+
],
429+
[
430+
'ab OR (!(cd OR ef))',
431+
'(((e.title LIKE ? OR e.content LIKE ?) )) OR (NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) )))',
432+
['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
433+
],
434+
[
435+
'ab !(cd OR ef)',
436+
'(((e.title LIKE ? OR e.content LIKE ?) )) AND NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ))',
437+
['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
438+
],
439+
[
440+
'ab OR !(cd OR ef)',
441+
'(((e.title LIKE ? OR e.content LIKE ?) )) OR NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ))',
442+
['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%'],
443+
],
444+
[
445+
'(ab (!cd OR ef OR (gh))) OR !(ij OR kl)',
446+
'((((e.title LIKE ? OR e.content LIKE ?) )) AND (((e.title NOT LIKE ? AND e.content NOT LIKE ? )) OR (((e.title LIKE ? OR e.content LIKE ?) )) ' .
447+
'OR (((e.title LIKE ? OR e.content LIKE ?) )))) OR NOT (((e.title LIKE ? OR e.content LIKE ?) ) OR ((e.title LIKE ? OR e.content LIKE ?) ))',
448+
['%ab%', '%ab%', '%cd%', '%cd%', '%ef%', '%ef%', '%gh%', '%gh%', '%ij%', '%ij%', '%kl%', '%kl%'],
449+
],
340450
];
341451
}
342452
}

0 commit comments

Comments
 (0)