Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(blooms): Exclude label filters where label name is part of the series labels. #14661

Merged
merged 8 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions pkg/bloomgateway/bloomgateway_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ func groupRefs(t *testing.T, chunkRefs []*logproto.ChunkRef) []*logproto.Grouped
t.Helper()
grouped := groupChunkRefs(nil, chunkRefs, nil)
// Put fake labels to the series
for i, g := range grouped {
for _, g := range grouped {
g.Labels = &logproto.IndexSeries{
Labels: logproto.FromLabelsToLabelAdapters(labels.FromStrings("foo", fmt.Sprintf("%d", i))),
Labels: logproto.FromLabelsToLabelAdapters(labels.FromStrings("foo", fmt.Sprintf("%d", g.Fingerprint))),
}
}

Expand Down Expand Up @@ -305,17 +305,17 @@ func TestBloomGateway_FilterChunkRefs(t *testing.T) {
{From: 1696248000000, Through: 1696251600000, Checksum: 2},
{From: 1696244400000, Through: 1696248000000, Checksum: 4},
}, Labels: &logproto.IndexSeries{
Labels: logproto.FromLabelsToLabelAdapters(labels.FromStrings("foo", "0")),
Labels: logproto.FromLabelsToLabelAdapters(labels.FromStrings("foo", "1000")),
}},
{Fingerprint: 2000, Tenant: tenantID, Refs: []*logproto.ShortRef{
{From: 1696255200000, Through: 1696258800000, Checksum: 3},
}, Labels: &logproto.IndexSeries{
Labels: logproto.FromLabelsToLabelAdapters(labels.FromStrings("foo", "1")),
Labels: logproto.FromLabelsToLabelAdapters(labels.FromStrings("foo", "2000")),
}},
{Fingerprint: 3000, Tenant: tenantID, Refs: []*logproto.ShortRef{
{From: 1696240800000, Through: 1696244400000, Checksum: 1},
}, Labels: &logproto.IndexSeries{
Labels: logproto.FromLabelsToLabelAdapters(labels.FromStrings("foo", "2")),
Labels: logproto.FromLabelsToLabelAdapters(labels.FromStrings("foo", "3000")),
}},
},
}, res)
Expand Down
1 change: 1 addition & 0 deletions pkg/bloomgateway/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ func (e extractor) Extract(start, end int64, r resultscache.Response, _, _ int64
if len(refs) > 0 {
chunkRefs = append(chunkRefs, &logproto.GroupedChunkRefs{
Fingerprint: chunkRef.Fingerprint,
Labels: chunkRef.Labels,
Tenant: chunkRef.Tenant,
Refs: refs,
})
Expand Down
1 change: 1 addition & 0 deletions pkg/bloomgateway/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ func mergeSeries(input [][]*logproto.GroupedChunkRefs, buf []*logproto.GroupedCh
}
return &logproto.GroupedChunkRefs{
Fingerprint: a.Fingerprint,
Labels: a.Labels,
Tenant: a.Tenant,
Refs: mergeChunkSets(a.Refs, b.Refs),
}
Expand Down
1 change: 1 addition & 0 deletions pkg/logproto/compat.go
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,7 @@ func (m *FilterChunkRefRequest) WithStartEndForCache(start, end time.Time) resul
if len(refs) > 0 {
chunkRefs = append(chunkRefs, &GroupedChunkRefs{
Fingerprint: chunkRef.Fingerprint,
Labels: chunkRef.Labels,
Tenant: chunkRef.Tenant,
Refs: refs,
})
Expand Down
43 changes: 19 additions & 24 deletions pkg/storage/bloom/v1/bloom_tester.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,45 +157,40 @@ func (sm stringMatcherTest) Matches(series labels.Labels, bloom filter.Checker)
// 2. It should be possible to test for just the key

var (
combined = fmt.Sprintf("%s=%s", sm.matcher.Key, sm.matcher.Value)

rawKey = unsafe.Slice(unsafe.StringData(sm.matcher.Key), len(sm.matcher.Key))
combined = fmt.Sprintf("%s=%s", sm.matcher.Key, sm.matcher.Value)
rawCombined = unsafe.Slice(unsafe.StringData(combined), len(combined))
)

return sm.match(series, bloom, rawKey, rawCombined)
return sm.match(series, bloom, rawCombined)
}

func (sm stringMatcherTest) MatchesWithPrefixBuf(series labels.Labels, bloom filter.Checker, buf []byte, prefixLen int) bool {
var (
combined = fmt.Sprintf("%s=%s", sm.matcher.Key, sm.matcher.Value)

prefixedKey = appendToBuf(buf, prefixLen, sm.matcher.Key)
combined = fmt.Sprintf("%s=%s", sm.matcher.Key, sm.matcher.Value)
prefixedCombined = appendToBuf(buf, prefixLen, combined)
)

return sm.match(series, bloom, prefixedKey, prefixedCombined)
return sm.match(series, bloom, prefixedCombined)
}

func (sm stringMatcherTest) match(series labels.Labels, bloom filter.Checker, key []byte, combined []byte) bool {
// If the label is part of the series, we cannot check the bloom since
// the label is not structured metadata
if value := series.Get(sm.matcher.Key); value != "" {
// If the series label value is the same as the matcher value, we cannot filter out this chunk.
// Otherwise, we can filter out this chunk.
// E.g. `{env="prod"} | env="prod"` should not filter out the chunk.
// E.g. `{env="prod"} | env="dev"` should filter out the chunk.
// E.g. `{env="prod"} | env=""` should filter out the chunk.
return value == sm.matcher.Value
// match returns true if the series matches the matcher or is in the bloom filter.
// TODO(salvacorts): support filtering out chunks for labels overriden by structurdd metadata.
// We'd need passing a list of structured metadata fields similarly to how we pass the series.
// SEE: https://github.com/grafana/loki/pull/14661#discussion_r1824228343
func (sm stringMatcherTest) match(series labels.Labels, bloom filter.Checker, combined []byte) bool {
// If we don't have the series labels, we cannot disambiguate which labels come from the series in which case
// we may filter out chunks for queries like `{env="prod"} | env="prod"` if env=prod is not structured metadata
if len(series) == 0 {
return true
}

// To this point we know the label is structured metadata so if the label name is not
// in the bloom, we can filter out the chunk.
if !bloom.Test(key) {
return false
}
// It's in the series if the key is set and has the same value.
// By checking val != "" we handle `{env="prod"} | user=""`.
val := series.Get(sm.matcher.Key)
inSeries := val != "" && val == sm.matcher.Value

return bloom.Test(combined)
inBloom := bloom.Test(combined)
return inSeries || inBloom
}

// appendToBuf is the equivalent of append(buf[:prefixLen], str). len(buf) must
Expand Down
29 changes: 24 additions & 5 deletions pkg/storage/bloom/v1/bloom_tester_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@ func TestLabelMatchersToBloomTest(t *testing.T) {
tokenizer,
push.LabelAdapter{Name: "trace_id", Value: "exists_1"},
push.LabelAdapter{Name: "trace_id", Value: "exists_2"},
push.LabelAdapter{Name: "app", Value: "other"},
)
)

series := labels.FromStrings("app", "fake")
series := labels.FromStrings("env", "prod", "app", "fake")
tt := []struct {
name string
query string
Expand Down Expand Up @@ -66,15 +67,33 @@ func TestLabelMatchersToBloomTest(t *testing.T) {
match: false,
},
{
name: "filter series label with different value",
query: `{app="fake"} | app="noexist"`,
name: "ignore label from series",
query: `{app="fake"} | env="prod"`,
match: true,
},
{
name: "filter label from series",
query: `{app="fake"} | env="dev"`, // env is set to prod in the series
match: false,
},
// We cannot support this test case until we can forward a list of structured metadata fields.
// We cannot check if the key is structured metadata using the bloom because these are probabilistic
// E.g. bloom.Test("env") may return true even if env is not structured metadata.
//{
// name: "filter label from series overridden by structured metadata",
// query: `{app="fake"} | app="fake"`, // app is set to other in the structured metadata
// match: false,
//},
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, I'm trying to understand this.

Shouldn't {app="fake"} | app="fake" only return false if every single log line in a chunk has app as structured metadata (and none of them are fake)? I'm not sure knowing the list of fields alone would be enough to ensure we're not filtering out chunks incorrectly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, you are right. I overthought this one too much.

{
name: "ignore label from series",
query: `{app="fake"} | app="fake"`,
name: "ignore label from series and structured metadata",
query: `{app="fake"} | app="other"`,
match: true,
},
{
name: "filter series label with non-existing value",
query: `{app="fake"} | app="noexist"`,
match: false,
},
{
name: "ignore label from series with empty value",
query: `{app="fake"} | app=""`,
Expand Down
Loading