
Use Prometheus fast regexp #4329

Merged (11 commits) on Nov 20, 2024
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -60,6 +60,8 @@
 * [ENHANCEMENT] Add `invalid_utf8` to reasons spanmetrics will discard spans. [#4293](https://github.com/grafana/tempo/pull/4293) (@zalegrala)
 * [ENHANCEMENT] Reduce frontend and querier allocations by dropping HTTP headers early in the pipeline. [#4298](https://github.com/grafana/tempo/pull/4298) (@joe-elliott)
 * [ENHANCEMENT] Reduce ingester working set by improving prealloc behavior. [#4344](https://github.com/grafana/tempo/pull/4344) (@joe-elliott)
+* [ENHANCEMENT] Use Prometheus fast regexp for TraceQL regular expression matchers. [#4329](https://github.com/grafana/tempo/pull/4329) (@joe-elliott)
+  **BREAKING CHANGE** All regular expression matchers will now be fully anchored. `span.foo =~ "bar"` will now be evaluated as `span.foo =~ "^bar$"`.
 * [BUGFIX] Replace hedged requests roundtrips total with a counter. [#4063](https://github.com/grafana/tempo/pull/4063) [#4078](https://github.com/grafana/tempo/pull/4078) (@galalen)
 * [BUGFIX] Metrics generators: Correctly drop from the ring before stopping ingestion to reduce drops during a rollout. [#4101](https://github.com/grafana/tempo/pull/4101) (@joe-elliott)
 * [BUGFIX] Correctly handle 400 Bad Request and 404 Not Found in gRPC streaming [#4144](https://github.com/grafana/tempo/pull/4144) (@mapno)
3 changes: 2 additions & 1 deletion docs/sources/tempo/traceql/_index.md
@@ -254,7 +254,8 @@ The implemented comparison operators are:
 - `=~` (regular expression)
 - `!~` (negated regular expression)

-TraceQL uses Golang regular expressions. Online regular expression testing sites like https://regex101.com/ are convenient to validate regular expressions used in TraceQL queries.
+TraceQL uses Golang regular expressions. Online regular expression testing sites like https://regex101.com/ are convenient to validate regular expressions used in TraceQL queries. All regular expressions
+are treated as fully anchored. For example, `span.foo =~ "bar"` is evaluated as `span.foo =~ "^bar$"`.

 For example, to find all traces where an `http.status_code` attribute in a span is greater than `400` but less than or equal to `500`:
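Because the new matcher is fully anchored, substring-style patterns change meaning. As a quick sanity check, here is a minimal, hypothetical standalone sketch (not part of this diff) using `labels.NewFastRegexMatcher`, the same Prometheus matcher this PR wires in:

```go
package main

import (
	"fmt"

	"github.com/prometheus/prometheus/model/labels"
)

func main() {
	// FastRegexMatcher anchors the pattern, so "bar" behaves like "^bar$".
	m, err := labels.NewFastRegexMatcher("bar")
	if err != nil {
		panic(err)
	}

	fmt.Println(m.MatchString("bar"))    // true
	fmt.Println(m.MatchString("foobar")) // false: substring matches no longer pass
}
```

Queries that relied on substring semantics need explicit wildcards, e.g. `span.foo =~ ".*bar.*"`; the test updates later in this diff make exactly that change (`ba` becomes `ba.*`).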
62 changes: 14 additions & 48 deletions pkg/parquetquery/predicates.go
@@ -3,9 +3,10 @@ package parquetquery

 import (
 	"bytes"
 	"fmt"
-	"regexp"
 	"strings"

+	"github.com/grafana/tempo/pkg/regexp"
+
 	pq "github.com/parquet-go/parquet-go"
 )

@@ -88,9 +89,7 @@ func (p *StringInPredicate) KeepPage(pq.Page) bool {
 }

 type regexPredicate struct {
-	regs        []*regexp.Regexp
-	matches     map[string]bool
-	shouldMatch bool
+	matcher *regexp.Regexp
 }

 var _ Predicate = (*regexPredicate)(nil)
@@ -108,66 +107,33 @@ func NewRegexNotInPredicate(regs []string) (Predicate, error) {
 }

 func newRegexPredicate(regs []string, shouldMatch bool) (Predicate, error) {
-	p := &regexPredicate{
-		regs:        make([]*regexp.Regexp, 0, len(regs)),
-		matches:     make(map[string]bool),
-		shouldMatch: shouldMatch,
-	}
-	for _, reg := range regs {
-		r, err := regexp.Compile(reg)
-		if err != nil {
-			return nil, err
-		}
-		p.regs = append(p.regs, r)
-	}
-	return p, nil
+	m, err := regexp.NewRegexp(regs, shouldMatch)
+	if err != nil {
+		return nil, err
+	}
+
+	return &regexPredicate{
+		matcher: m,
+	}, nil
 }

 func (p *regexPredicate) String() string {
-	var strings string
-	for _, s := range p.regs {
-		strings += fmt.Sprintf("%s, ", s.String())
-	}
-	return fmt.Sprintf("RegexInPredicate{%s}", strings)
+	return fmt.Sprintf("RegexPredicate{%s}", p.matcher.String())
 }

 func (p *regexPredicate) keep(v *pq.Value) bool {
 	if v.IsNull() {
 		return false
 	}

-	b := v.ByteArray()
-
-	// Check uses zero alloc optimization of map[string([]byte)]
-	if matched, ok := p.matches[string(b)]; ok {
-		return matched
-	}
-
-	matched := false
-	for _, r := range p.regs {
-		if r.Match(b) == p.shouldMatch {
-			matched = true
-			break
-		}
-	}
-
-	// Only alloc the string when updating the map
-	p.matches[string(b)] = matched
-
-	return matched
+	return p.matcher.Match(v.ByteArray())
 }

 func (p *regexPredicate) KeepColumnChunk(cc *ColumnChunkHelper) bool {
 	d := cc.Dictionary()

-	// Reset match cache on each row group change
-	// Use exact size of the incoming dictionary
-	// if present and larger.
-	count := len(p.matches)
-	if d != nil && d.Len() > count {
-		count = d.Len()
-	}
-	p.matches = make(map[string]bool, count)
+	// reset memoized matches on each column chunk change. should we do this?
+	p.matcher.Reset()

 	if d != nil {
 		return keepDictionary(d, p.KeepValue)
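To make the delegation concrete, here is a hedged sketch of the predicate end to end. It assumes only the exported pieces shown above (`NewRegexInPredicate` and the `Predicate` interface's `KeepValue`); column-chunk and dictionary plumbing are elided:

```go
package main

import (
	"fmt"

	"github.com/grafana/tempo/pkg/parquetquery"
	pq "github.com/parquet-go/parquet-go"
)

func main() {
	// Compilation and memoization now live in pkg/regexp.
	pred, err := parquetquery.NewRegexInPredicate([]string{"GET|POST"})
	if err != nil {
		panic(err)
	}

	// Patterns are fully anchored, so substring hits no longer pass.
	fmt.Println(pred.KeepValue(pq.ValueOf("GET")))      // true
	fmt.Println(pred.KeepValue(pq.ValueOf("GET /api"))) // false
}
```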
120 changes: 120 additions & 0 deletions pkg/regexp/regexp.go
@@ -0,0 +1,120 @@
package regexp

import (
	"fmt"
	"regexp"
	"unsafe"

	"github.com/prometheus/prometheus/model/labels"
)

// in order to prevent building an enormous map on extremely high cardinality fields we institute a max.
// this number is not tuned. on extremely high cardinality fields memoization is wasteful b/c we rarely
// see the same value 2x, so it's all overhead. on lower cardinality fields with an expensive regex,
// memoization is very effective at speeding up queries.
const maxMemoize = 1000

type Regexp struct {
	matchers    []*labels.FastRegexMatcher
	matches     map[string]bool
	shouldMatch bool
}

func NewRegexp(regexps []string, shouldMatch bool) (*Regexp, error) {
	matchers := make([]*labels.FastRegexMatcher, 0, len(regexps))

	for _, r := range regexps {
		m, err := labels.NewFastRegexMatcher(r)
		if err != nil {
			return nil, err
		}
		matchers = append(matchers, m)
	}

	// only memoize if there's an unoptimized matcher. memoization is capped
	// at maxMemoize entries to bound memory on high cardinality fields.
	var matches map[string]bool
	for _, m := range matchers {
		if shouldMemoize(m) {
			matches = make(map[string]bool)
			break
		}
	}

	return &Regexp{
		matchers:    matchers,
		matches:     matches,
		shouldMatch: shouldMatch,
	}, nil
}

// Match converts b to a string without copying (via unsafe.String) and delegates to MatchString.
func (r *Regexp) Match(b []byte) bool {
	return r.MatchString(unsafe.String(unsafe.SliceData(b), len(b)))
}

func (r *Regexp) MatchString(s string) bool {
	// if we're memoizing check existing matches
	if r.matches != nil {
		if matched, ok := r.matches[s]; ok {
			return matched
		}
	}

	matched := false
	for _, m := range r.matchers {
		if m.MatchString(s) == r.shouldMatch {
			matched = true
			break
		}
	}

	if r.matches != nil && len(r.matches) < maxMemoize {
		r.matches[s] = matched
	}

	return matched
}

func (r *Regexp) Reset() {
	if r.matches != nil {
		clear(r.matches)
	}
}

func (r *Regexp) String() string {
	var strings string
	for _, m := range r.matchers {
		strings += fmt.Sprintf("%s, ", m.GetRegexString())
	}

	return strings
}

// shouldMemoize returns true if we believe that memoizing this regex would be faster
// than evaluating it directly. see thoughts below.
func shouldMemoize(m *labels.FastRegexMatcher) bool {
	// matches labels.FastRegexMatcher
	type cheatToSeeInternals struct {
		reString string
		re       *regexp.Regexp

		setMatches    []string
		stringMatcher labels.StringMatcher
		prefix        string
		suffix        string
		contains      []string

		matchString func(string) bool
	}

	cheat := (*cheatToSeeInternals)(unsafe.Pointer(m))

	// TODO: research and improve this. we're making a guess on whether an optimization will improve the regex
	// performance enough that it's faster to not memoize. See compileMatchStringFunction() in the labels
	// package. maybe there's even a way to do this dynamically?
	return cheat.stringMatcher == nil && // a stringMatcher definitively rejects or accepts. if a string matcher is present the regex will never be executed
		len(cheat.setMatches) == 0 && // setMatches definitively reject or accept. if len != 0 the regex will never be executed, but perhaps if there are a very large # of setMatches we prefer memoizing anyway?
		cheat.prefix == "" && // prefix and suffix _do not_ prevent the regex from executing, but they are quick to evaluate and tend to nicely filter down.
		cheat.suffix == "" // perhaps a length requirement would be an improvement? i.e. require a prefix or suffix of at least 3 chars?
	// len(cheat.contains) == 0 // in testing, it was faster to memoize with a contains filter. perhaps if the filters are long enough we don't memoize?
}
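A short usage sketch of the package as defined above (hypothetical caller code; the `.*bar.*foo.*` pattern is one the tests below expect to memoize):

```go
package main

import (
	"fmt"

	"github.com/grafana/tempo/pkg/regexp"
)

func main() {
	// shouldMatch=true mirrors NewRegexInPredicate; false inverts the result.
	m, err := regexp.NewRegexp([]string{".*bar.*foo.*"}, true)
	if err != nil {
		panic(err)
	}

	fmt.Println(m.MatchString("xx bar yy foo zz")) // true; memoized for repeat lookups
	fmt.Println(m.MatchString("no match here"))    // false; also memoized

	// Callers such as the parquet predicate clear the memo between column chunks.
	m.Reset()
}
```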
64 changes: 64 additions & 0 deletions pkg/regexp/regexp_test.go
@@ -0,0 +1,64 @@
package regexp

import (
	"testing"

	"github.com/prometheus/prometheus/model/labels"
	"github.com/stretchr/testify/require"
)

func TestRegexpMatch(t *testing.T) {
	r, err := NewRegexp([]string{"^a.*"}, true)
	require.NoError(t, err)

	require.True(t, r.Match([]byte("abc")))
	require.True(t, r.MatchString("abc"))
	require.False(t, r.MatchString("xyz"))

	r, err = NewRegexp([]string{"^a.*"}, false)
	require.NoError(t, err)

	require.False(t, r.Match([]byte("abc")))
	require.False(t, r.MatchString("abc"))
	require.True(t, r.MatchString("xyz"))
}

func TestShouldMemoize(t *testing.T) {
	tcs := []struct {
		regex         string
		shouldMemoize bool
	}{
		{
			regex:         ".*",
			shouldMemoize: false,
		},
		{
			regex:         "foo.*",
			shouldMemoize: false,
		},
		{
			regex:         ".*bar",
			shouldMemoize: false,
		},
		{
			regex:         "foo|bar",
			shouldMemoize: false,
		},
		{
			regex:         ".*bar.*", // creates a containsStringMatcher so should not memoize
			shouldMemoize: false,
		},
		{
			regex:         ".*bar.*foo.*", // calls contains in order
			shouldMemoize: true,
		},
	}

	for _, tc := range tcs {
		t.Run(tc.regex, func(t *testing.T) {
			m, err := labels.NewFastRegexMatcher(tc.regex)
			require.NoError(t, err)
			require.Equal(t, tc.shouldMemoize, shouldMemoize(m))
		})
	}
}
2 changes: 1 addition & 1 deletion pkg/traceql/ast.go
@@ -6,11 +6,11 @@ import (
 	"fmt"
 	"hash/fnv"
 	"math"
-	"regexp"
 	"slices"
 	"time"
 	"unsafe"

+	"github.com/grafana/tempo/pkg/regexp"
 	"github.com/grafana/tempo/pkg/tempopb"
 )

9 changes: 5 additions & 4 deletions pkg/traceql/ast_execute.go
@@ -4,8 +4,9 @@ import (
 	"errors"
 	"fmt"
 	"math"
-	"regexp"
 	"strings"

+	"github.com/grafana/tempo/pkg/regexp"
 )

 var errSpansetOperationMultiple = errors.New("spanset operators are not supported for multiple spansets per trace. consider using coalesce()")
@@ -392,7 +393,7 @@ func (o *BinaryOperation) execute(span Span) (Static, error) {
 		return NewStaticBool(strings.Compare(lhs.String(), rhs.String()) <= 0), nil
 	case OpRegex:
 		if o.compiledExpression == nil {
-			o.compiledExpression, err = regexp.Compile(rhs.EncodeToString(false))
+			o.compiledExpression, err = regexp.NewRegexp([]string{rhs.EncodeToString(false)}, true)
 			if err != nil {
 				return NewStaticNil(), err
 			}
@@ -402,14 +403,14 @@
 		return NewStaticBool(matched), err
 	case OpNotRegex:
 		if o.compiledExpression == nil {
-			o.compiledExpression, err = regexp.Compile(rhs.EncodeToString(false))
+			o.compiledExpression, err = regexp.NewRegexp([]string{rhs.EncodeToString(false)}, false)
 			if err != nil {
 				return NewStaticNil(), err
 			}
 		}

 		matched := o.compiledExpression.MatchString(lhs.EncodeToString(false))
-		return NewStaticBool(!matched), err
+		return NewStaticBool(matched), err
 	default:
 	}
 }
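One subtlety worth spelling out: the negation moved from the call site into the matcher, which is why `OpNotRegex` now returns `matched` directly. A minimal sketch of the `shouldMatch=false` semantics (hypothetical caller code):

```go
package main

import (
	"fmt"

	"github.com/grafana/tempo/pkg/regexp"
)

func main() {
	// With shouldMatch=false, MatchString returns true when the pattern
	// does NOT match, so the old `!matched` at the call site goes away.
	m, err := regexp.NewRegexp([]string{"bar"}, false)
	if err != nil {
		panic(err)
	}

	fmt.Println(m.MatchString("baz")) // true: no match, which is what !~ wants
	fmt.Println(m.MatchString("bar")) // false: a match, so !~ rejects it
}
```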
8 changes: 4 additions & 4 deletions pkg/traceql/ast_execute_test.go
@@ -575,7 +575,7 @@ func TestSpansetOperationEvaluateArray(t *testing.T) {
 			},
 		},
 		{
-			"{ .foo =~ `ba` }", // match string array with regex
+			"{ .foo =~ `ba.*` }", // match string array with regex
 			[]*Spanset{
 				{Spans: []Span{
 					&mockSpan{id: []byte{1}, attributes: map[Attribute]Static{NewAttribute("foo"): NewStaticStringArray([]string{"bar", "baz"})}},
@@ -591,7 +591,7 @@
 			},
 		},
 		{
-			"{ .foo !~ `ba` }", // regex non-matching
+			"{ .foo !~ `ba.*` }", // regex non-matching
 			[]*Spanset{
 				{Spans: []Span{
 					&mockSpan{id: []byte{1}, attributes: map[Attribute]Static{NewAttribute("foo"): NewStaticStringArray([]string{"foo", "baz"})}},
@@ -953,7 +953,7 @@ func TestSpansetOperationEvaluateArraySymmetric(t *testing.T) {
 			},
 		},
 		{
-			"{ `ba` =~ .foo }", // Symmetric regex match for string array
+			"{ `ba.*` =~ .foo }", // Symmetric regex match for string array
 			[]*Spanset{
 				{Spans: []Span{
 					&mockSpan{id: []byte{1}, attributes: map[Attribute]Static{NewAttribute("foo"): NewStaticStringArray([]string{"bar", "baz"})}},
@@ -969,7 +969,7 @@
 			},
 		},
 		{
-			"{ `ba` !~ .foo }", // Symmetric regex non-match for string array
+			"{ `ba.*` !~ .foo }", // Symmetric regex non-match for string array
 			[]*Spanset{
 				{Spans: []Span{
 					&mockSpan{id: []byte{1}, attributes: map[Attribute]Static{NewAttribute("foo"): NewStaticStringArray([]string{"foo", "baz"})}},