feat: improve placeholder replacement of byte sizes (#13508)

na-- · web-flow · commit ac284ca00ed0 · 2024-07-12T09:20:22.000-06:00
diff --git a/pkg/pattern/tokenization/replacer.go b/pkg/pattern/tokenization/replacer.go
@@ -314,6 +314,10 @@ restore: // should be faster than a defer
 	return false
 }
 
+// 'b' and 'B' are deliberately absent from this table because of how byte size
+// units are checked below: if they were present, suffixes like 'Bb', 'bb', etc.
+// would be accepted as valid units. Plain bytes are also only valid with integer
+// values (e.g. "3 B" but not "0.2B"), so they are handled as special cases instead.
 var byteSizes = [256]bool{'k': true, 'K': true, 'm': true, 'M': true, 'g': true, 'G': true, 't': true, 'T': true, 'p': true, 'P': true}
 
 // Only moves the head forward if it successfully matches a duration
@@ -339,6 +343,22 @@ func (r *replacer) advanceBytesize(c1 byte) (matched bool) {
 	return false
 }
 
+func (r *replacer) advanceSpacedBytesize(canBeBytes bool) (matched bool) {
+	// Consume the character after the space: a match is a 'b'/'B' followed by a boundary (only if canBeBytes) or a unit accepted by advanceBytesize; otherwise backtrack() is called
+	c1, hasNext := r.advance()
+	if !hasNext {
+		return false
+	}
+	if canBeBytes && (c1 == 'b' || c1 == 'B') && r.peekNextIsBoundary() {
+		return true
+	}
+	if r.advanceBytesize(c1) {
+		return true
+	}
+	r.backtrack()
+	return false
+}
+
 func (r *replacer) advance() (c byte, advanced bool) {
 	if r.head >= len(r.source) {
 		return 0, false
@@ -394,6 +414,14 @@ func (r *replacer) handleHexOrUnit(hasMinusPrefix bool, n1, l1 uint, c1 byte) (e
 		c1 = r.peekFirstNonInt()
 	}
 
+	// Special case, this might be a byte size
+	if (c1 == 'b' || c1 == 'B') && r.peekNextIsBoundary() {
+		// We do not subsume a minus sign: byte sizes are unlikely to be
+		// negative, so it is more likely a dash that is part of a range
+		r.emit(hasMinusPrefix, placeholderBytesize)
+		return true
+	}
+
 	// Maybe we are at the start of a hex string, either something like
 	// "[0-9]+[a-f]", "[0-9]+[A-F]", or "0x". We support both lower and upper
 	// case letters, but to avoid false positives, we want hex replacements to
@@ -489,6 +517,14 @@ func (r *replacer) handleNumberWithDecimal(hasMinusPrefix bool, n1 uint, l1 uint
 		return r.handlePotentialUnitWithDecimal(hasMinusPrefix, b2)
 	}
 
+	// This can be a byte size with a space, e.g. "3.14 GiB"
+	if b2 == ' ' && r.advanceSpacedBytesize(false) {
+		// We do not subsume a minus sign: byte sizes are unlikely to be
+		// negative, so it is more likely a dash that is part of a range
+		r.emit(hasMinusPrefix, placeholderBytesize)
+		return true
+	}
+
 	// We have a decimal number followed by a non-dot boundary, so this is not
 	// an IP or a version number or anything like that.
 	if b2 != '.' {
@@ -633,6 +669,11 @@ func (r *replacer) handleNumberStart(hasMinusPrefix bool) (endsWithBoundary bool
 	case n1 <= maxYear && l1 <= 4 && (b1 == '-' || b1 == '/'):
 		return r.handleSaneTimestamp(hasMinusPrefix, n1, b1)
 
+	// This might be a byte size with a space, e.g. "2 b", "3 GiB"
+	case b1 == ' ' && r.advanceSpacedBytesize(true):
+		r.emit(hasMinusPrefix, placeholderBytesize)
+		return true
+
 	// Weird RFC822 dates like "02 Jan 06 15:04 MST"
 	case n1 <= 31 && l1 <= 2 && b1 == ' ':
 		if r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(true) && r.advanceStringOrNumericTimeZone(false) {
diff --git a/pkg/pattern/tokenization/tokenization_test.go b/pkg/pattern/tokenization/tokenization_test.go
@@ -145,8 +145,11 @@ var tokenizationCornerTestCases = []tokenizationTestCase{
 		[]string{"<NUM>.<DURATION>", "3h121m3.<DURATION>", "1h0.<DURATION>", "100usa", "0.12msa"},
 	},
 	{
-		"2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit",
-		[]string{"<BYTESIZE>", "<BYTESIZE>-<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>:<BYTESIZE>"},
+		// Only integers are accepted for sizes in plain bytes (a fractional size like 0.2B doesn't make sense)
+		"2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit 5 B;124.1 KB/3b - 2b or 2 BeNot 13.37 b 3 b",
+		[]string{
+			"<BYTESIZE>", "<BYTESIZE>-<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>:<BYTESIZE>",
+			"<BYTESIZE>;<BYTESIZE>/<BYTESIZE>", "-", "<BYTESIZE>", "or", "<NUM>", "BeNot", "<NUM>", "b", "<BYTESIZE>"},
 	},
 	{
 		`status=123 status_code:500 status 200 status="-1" status_code:"404" httpStatus=200`,
@@ -175,6 +178,13 @@ var tokenizationRealisticTestCases = []tokenizationTestCase{
 			"level=debug", "ts=<TIMESTAMP>", "caller=shard_resolver.go:<NUM>", "bytes=<BYTESIZE>", "chunks=<NUM>", "streams=<NUM>", "entries=<NUM>", `msg="queried index"`, "type=single", `matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}"`, "duration=<DURATION>", "from=<TIMESTAMP>", "through=<TIMESTAMP>", "length=<DURATION>",
 		},
 	},
+	// tricky loki distributor message:
+	{
+		`level=debug ts=2024-07-12T12:25:06.175464934Z caller=push.go:146 org_id=29 traceID=7af4f918eab1c80f msg="push request parsed" path=/loki/api/v1/push contentType=application/x-protobuf contentEncoding= bodySize="8.8 kB" streams=11 entries=43 streamLabelsSize="3.4 kB" entriesSize="19 kB" structuredMetadataSize="71 B" totalSize="22 kB" mostRecentLagMs=167 adaptiveLogsDroppedLines=10 adaptiveLogsDroppedSize=4965 adaptiveLogsMatchedLines=37`,
+		[]string{
+			"level=debug", "ts=<TIMESTAMP>", "caller=push.go:<NUM>", "org_id=<NUM>", "traceID=<HEX>", `msg="push request parsed"`, "path=/loki/api/v1/push", "contentType=application/x-protobuf", "contentEncoding=", `bodySize="<BYTESIZE>"`, "streams=<NUM>", "entries=<NUM>", `streamLabelsSize="<BYTESIZE>"`, `entriesSize="<BYTESIZE>"`, `structuredMetadataSize="<BYTESIZE>"`, `totalSize="<BYTESIZE>"`, "mostRecentLagMs=<NUM>", "adaptiveLogsDroppedLines=<NUM>", "adaptiveLogsDroppedSize=<NUM>", "adaptiveLogsMatchedLines=<NUM>",
+		},
+	},
 	// random JSON logs
 	{
 		`{"timestamp": "2022-12-23T12:34:56Z", "level": "debug", "message": "Server starting", "server_id": "abcdefghij", "start_time": "2022-12-23T12:30:00Z"}`,