Skip to content

Commit ac284ca

Browse files
authored
feat: improve placeholder replacement of byte sizes (#13508)
1 parent 07c3c76 commit ac284ca

File tree

2 files changed

+53
-2
lines changed

2 files changed

+53
-2
lines changed

pkg/pattern/tokenization/replacer.go

+41
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,10 @@ restore: // should be faster than a defer
314314
return false
315315
}
316316

317+
// 'b' and 'B' are not present here because of the way we check for byte size
318+
// units below. If they were present, then suffixes like 'Bb', 'bb', etc. would
319+
// be considered valid byte sizes. Also, only integer numbers are accepted as
320+
// valid byte sizes when the unit is plain bytes, so we handle bytes with special cases instead.
317321
// byteSizes is a lookup table indexed by byte value; an entry is true for the
// unit-prefix letters (both cases) that may begin a byte size suffix.
var byteSizes = [256]bool{
	'k': true, 'K': true,
	'm': true, 'M': true,
	'g': true, 'G': true,
	't': true, 'T': true,
	'p': true, 'P': true,
}
318322

319323
// Only moves the head forward if it successfully matches a byte size
@@ -339,6 +343,22 @@ func (r *replacer) advanceBytesize(c1 byte) (matched bool) {
339343
return false
340344
}
341345

346+
func (r *replacer) advanceSpacedBytesize(canBeBytes bool) (matched bool) {
347+
// Get the next character after the space
348+
c1, hasNext := r.advance()
349+
if !hasNext {
350+
return false
351+
}
352+
if canBeBytes && (c1 == 'b' || c1 == 'B') && r.peekNextIsBoundary() {
353+
return true
354+
}
355+
if r.advanceBytesize(c1) {
356+
return true
357+
}
358+
r.backtrack()
359+
return false
360+
}
361+
342362
func (r *replacer) advance() (c byte, advanced bool) {
343363
if r.head >= len(r.source) {
344364
return 0, false
@@ -394,6 +414,14 @@ func (r *replacer) handleHexOrUnit(hasMinusPrefix bool, n1, l1 uint, c1 byte) (e
394414
c1 = r.peekFirstNonInt()
395415
}
396416

417+
// Special case, this might be a byte size
418+
if (c1 == 'b' || c1 == 'B') && r.peekNextIsBoundary() {
419+
// We do not subsume a minus sign - byte sizes are unlikely to be
420+
// negative, it's more likely this is a dash as a part of a range
421+
r.emit(hasMinusPrefix, placeholderBytesize)
422+
return true
423+
}
424+
397425
// Maybe we are at the start of a hex string, either something like
398426
// "[0-9]+[a-f]", "[0-9]+[A-F]", or "0x". We support both lower and upper
399427
// case letters, but to avoid false positives, we want hex replacements to
@@ -489,6 +517,14 @@ func (r *replacer) handleNumberWithDecimal(hasMinusPrefix bool, n1 uint, l1 uint
489517
return r.handlePotentialUnitWithDecimal(hasMinusPrefix, b2)
490518
}
491519

520+
// This can be a byte size with a space, e.g. "3.14 GiB"
521+
if b2 == ' ' && r.advanceSpacedBytesize(false) {
522+
// We do not subsume a minus sign - byte sizes are unlikely to be
523+
// negative, it's more likely this is a dash as a part of a range
524+
r.emit(hasMinusPrefix, placeholderBytesize)
525+
return true
526+
}
527+
492528
// We have a decimal number followed by a non-dot boundary, so this is not
493529
// an IP or a version number or anything like that.
494530
if b2 != '.' {
@@ -633,6 +669,11 @@ func (r *replacer) handleNumberStart(hasMinusPrefix bool) (endsWithBoundary bool
633669
case n1 <= maxYear && l1 <= 4 && (b1 == '-' || b1 == '/'):
634670
return r.handleSaneTimestamp(hasMinusPrefix, n1, b1)
635671

672+
// This might be a byte size with a space, e.g. "2 b", "3 GiB"
673+
case b1 == ' ' && r.advanceSpacedBytesize(true):
674+
r.emit(hasMinusPrefix, placeholderBytesize)
675+
return true
676+
636677
// Weird RFC822 dates like "02 Jan 06 15:04 MST"
637678
case n1 <= 31 && l1 <= 2 && b1 == ' ':
638679
if r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(true) && r.advanceStringOrNumericTimeZone(false) {

pkg/pattern/tokenization/tokenization_test.go

+12-2
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,11 @@ var tokenizationCornerTestCases = []tokenizationTestCase{
145145
[]string{"<NUM>.<DURATION>", "3h121m3.<DURATION>", "1h0.<DURATION>", "100usa", "0.12msa"},
146146
},
147147
{
148-
"2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit",
149-
[]string{"<BYTESIZE>", "<BYTESIZE>-<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>:<BYTESIZE>"},
148+
// We only consider integers to be valid bytesizes in bytes (0.2B doesn't make sense)
149+
"2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit 5 B;124.1 KB/3b - 2b or 2 BeNot 13.37 b 3 b",
150+
[]string{
151+
"<BYTESIZE>", "<BYTESIZE>-<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>:<BYTESIZE>",
152+
"<BYTESIZE>;<BYTESIZE>/<BYTESIZE>", "-", "<BYTESIZE>", "or", "<NUM>", "BeNot", "<NUM>", "b", "<BYTESIZE>"},
150153
},
151154
{
152155
`status=123 status_code:500 status 200 status="-1" status_code:"404" httpStatus=200`,
@@ -175,6 +178,13 @@ var tokenizationRealisticTestCases = []tokenizationTestCase{
175178
"level=debug", "ts=<TIMESTAMP>", "caller=shard_resolver.go:<NUM>", "bytes=<BYTESIZE>", "chunks=<NUM>", "streams=<NUM>", "entries=<NUM>", `msg="queried index"`, "type=single", `matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}"`, "duration=<DURATION>", "from=<TIMESTAMP>", "through=<TIMESTAMP>", "length=<DURATION>",
176179
},
177180
},
181+
// tricky loki distributor message:
182+
{
183+
`level=debug ts=2024-07-12T12:25:06.175464934Z caller=push.go:146 org_id=29 traceID=7af4f918eab1c80f msg="push request parsed" path=/loki/api/v1/push contentType=application/x-protobuf contentEncoding= bodySize="8.8 kB" streams=11 entries=43 streamLabelsSize="3.4 kB" entriesSize="19 kB" structuredMetadataSize="71 B" totalSize="22 kB" mostRecentLagMs=167 adaptiveLogsDroppedLines=10 adaptiveLogsDroppedSize=4965 adaptiveLogsMatchedLines=37`,
184+
[]string{
185+
"level=debug", "ts=<TIMESTAMP>", "caller=push.go:<NUM>", "org_id=<NUM>", "traceID=<HEX>", `msg="push request parsed"`, "path=/loki/api/v1/push", "contentType=application/x-protobuf", "contentEncoding=", `bodySize="<BYTESIZE>"`, "streams=<NUM>", "entries=<NUM>", `streamLabelsSize="<BYTESIZE>"`, `entriesSize="<BYTESIZE>"`, `structuredMetadataSize="<BYTESIZE>"`, `totalSize="<BYTESIZE>"`, "mostRecentLagMs=<NUM>", "adaptiveLogsDroppedLines=<NUM>", "adaptiveLogsDroppedSize=<NUM>", "adaptiveLogsMatchedLines=<NUM>",
186+
},
187+
},
178188
// random JSON logs
179189
{
180190
`{"timestamp": "2022-12-23T12:34:56Z", "level": "debug", "message": "Server starting", "server_id": "abcdefghij", "start_time": "2022-12-23T12:30:00Z"}`,

0 commit comments

Comments
 (0)