open-telemetry · MrAlias · Nov 22, 2024 · Nov 22, 2024 · Nov 22, 2024 · Nov 22, 2024
@@ -22,6 +22,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 - Fix inconsistent request body closing in `go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp`. (#5954)
 - Fix inconsistent request body closing in `go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp`. (#5954)
 - Fix invalid exemplar keys in `go.opentelemetry.io/otel/exporters/prometheus`. (#5995)
+- Fix attribute value truncation in `go.opentelemetry.io/otel/sdk/trace`. (#5997)
 
 <!-- Released section -->
 <!-- Don't change this section unless doing release -->

@@ -347,47 +347,92 @@ func truncateAttr(limit int, attr attribute.KeyValue) attribute.KeyValue {
 	}
 	switch attr.Value.Type() {
 	case attribute.STRING:
-		if v := attr.Value.AsString(); len(v) > limit {
-			return attr.Key.String(safeTruncate(v, limit))
-		}
+		v := attr.Value.AsString()
+		return attr.Key.String(truncate(limit, v))
 	case attribute.STRINGSLICE:
 		v := attr.Value.AsStringSlice()
 		for i := range v {
-			if len(v[i]) > limit {
-				v[i] = safeTruncate(v[i], limit)
-			}
+			v[i] = truncate(limit, v[i])
 		}
 		return attr.Key.StringSlice(v)
 	}
 	return attr
 }
 
-// safeTruncate truncates the string and guarantees valid UTF-8 is returned.
-func safeTruncate(input string, limit int) string {
-	if trunc, ok := safeTruncateValidUTF8(input, limit); ok {
-		return trunc
+// truncate returns a truncated version of s such that it contains less than
+// the limit number of characters. Truncation is applied by returning the limit
+// number of valid characters contained in s.
+//
+// If limit is negative, it returns the original string.
+//
+// UTF-8 is supported. When truncating, all invalid characters are dropped
+// before applying truncation.
+//
+// If s already contains less than the limit number of bytes, it is returned
+// unchanged. No invalid characters are removed.
+func truncate(limit int, s string) string {
+	// This prioritize performance in the following order based on the most
+	// common expected use-cases.
+	//
+	//  - Short values less than the default limit (128).
+	//  - Strings with valid encodings that exceed the limit.
+	//  - No limit.
+	//  - Strings with invalid encodings that exceed the limit.
+	if limit < 0 || len(s) <= limit {
+		return s
+	}
+
+	// Optimistically, assume all valid UTF-8.
+	var b strings.Builder
+	count := 0
+	for i, c := range s {
+		if c != utf8.RuneError {
+			count++
+			if count > limit {
+				return s[:i]
+			}
+			continue
+		}
+
+		_, size := utf8.DecodeRuneInString(s[i:])
+		if size == 1 {
+			// Invalid encoding.
+			b.Grow(len(s) - 1)
+			_, _ = b.WriteString(s[:i])
+			s = s[i:]
+			break
+		}
+	}
+
+	// Fast-path, no invalid input.
+	if b.Cap() == 0 {
+		return s
 	}
-	trunc, _ := safeTruncateValidUTF8(strings.ToValidUTF8(input, ""), limit)
-	return trunc
-}
 
-// safeTruncateValidUTF8 returns a copy of the input string safely truncated to
-// limit. The truncation is ensured to occur at the bounds of complete UTF-8
-// characters. If invalid encoding of UTF-8 is encountered, input is returned
-// with false, otherwise, the truncated input will be returned with true.
-func safeTruncateValidUTF8(input string, limit int) (string, bool) {
-	for cnt := 0; cnt <= limit; {
-		r, size := utf8.DecodeRuneInString(input[cnt:])
-		if r == utf8.RuneError {
-			return input, false
+	// Truncate while validating UTF-8.
+	for i := 0; i < len(s) && count < limit; {
+		c := s[i]
+		if c < utf8.RuneSelf {
+			// Optimization for single byte runes (common case).
+			_ = b.WriteByte(c)
+			i++
+			count++
+			continue
 		}
 
-		if cnt+size > limit {
-			return input[:cnt], true
+		_, size := utf8.DecodeRuneInString(s[i:])
+		if size == 1 {
+			// We checked for all 1-byte runes above, this is a RuneError.
+			i++
+			continue
 		}
-		cnt += size
+
+		_, _ = b.WriteString(s[i : i+size])
+		i += size
+		count++
 	}
-	return input, true
+
+	return b.String()
 }
 
 // End ends the span. This method does nothing if the span is already ended or

@@ -183,7 +183,7 @@ func TestSpanLimits(t *testing.T) {
 		// Ensure string and string slice attributes are truncated.
 		assert.Contains(t, attrs, attribute.String("string", "ab"))
 		assert.Contains(t, attrs, attribute.StringSlice("stringSlice", []string{"ab", "de"}))
-		assert.Contains(t, attrs, attribute.String("euro", ""))
+		assert.Contains(t, attrs, attribute.String("euro", "€"))
 
 		limits.AttributeValueLengthLimit = 0
 		attrs = testSpanLimits(t, limits).Attributes()

@@ -201,38 +201,158 @@ func TestTruncateAttr(t *testing.T) {
 			attr:  strSliceAttr,
 			want:  strSliceAttr,
 		},
+	}
+
+	for _, test := range tests {
+		name := fmt.Sprintf("%s->%s(limit:%d)", test.attr.Key, test.attr.Value.Emit(), test.limit)
+		t.Run(name, func(t *testing.T) {
+			assert.Equal(t, test.want, truncateAttr(test.limit, test.attr))
+		})
+	}
+}
+
+func TestTruncate(t *testing.T) {
+	type group struct {
+		limit    int
+		input    string
+		expected string
+	}
+
+	tests := []struct {
+		name   string
+		groups []group
+	}{
+		// Edge case: limit is negative, no truncation should occur
+		{
+			name: "NoTruncation",
+			groups: []group{
+				{-1, "No truncation!", "No truncation!"},
+			},
+		},
+
+		// Edge case: string is already shorter than the limit, no truncation
+		// should occur
 		{
-			// This tests the ordinary safeTruncate().
-			limit: 10,
-			attr:  attribute.String(key, "€€€€"), // 3 bytes each
-			want:  attribute.String(key, "€€€"),
+			name: "ShortText",
+			groups: []group{
+				{10, "Short text", "Short text"},
+				{15, "Short text", "Short text"},
+				{100, "Short text", "Short text"},
+			},
 		},
+
+		// Edge case: truncation happens with ASCII characters only
 		{
-			// This tests truncation with an invalid UTF-8 input.
-			//
-			// Note that after removing the invalid rune,
-			// the string is over length and still has to
-			// be truncated on a code point boundary.
-			limit: 10,
-			attr:  attribute.String(key, "€"[0:2]+"hello€€"), // corrupted first rune, then over limit
-			want:  attribute.String(key, "hello€"),
+			name: "ASCIIOnly",
+			groups: []group{
+				{1, "Hello World!", "H"},
+				{5, "Hello World!", "Hello"},
+				{12, "Hello World!", "Hello World!"},
+			},
 		},
+
+		// Truncation including multi-byte characters (UTF-8)
 		{
-			// This tests the fallback to invalidTruncate()
-			// where after validation the string does not require
-			// truncation.
-			limit: 6,
-			attr:  attribute.String(key, "€"[0:2]+"hello"), // corrupted first rune, then not over limit
-			want:  attribute.String(key, "hello"),
+			name: "ValidUTF-8",
+			groups: []group{
+				{7, "Hello, 世界", "Hello, "},
+				{8, "Hello, 世界", "Hello, 世"},
+				{2, "こんにちは", "こん"},
+				{3, "こんにちは", "こんに"},
+				{5, "こんにちは", "こんにちは"},
+				{12, "こんにちは", "こんにちは"},
+			},
+		},
+
+		// Truncation with invalid UTF-8 characters
+		{
+			name: "InvalidUTF-8",
+			groups: []group{
+				{11, "Invalid\x80text", "Invalidtext"},
+				// Do not modify invalid text if equal to limit.
+				{11, "Valid text\x80", "Valid text\x80"},
+				// Do not modify invalid text if under limit.
+				{15, "Valid text\x80", "Valid text\x80"},
+				{5, "Hello\x80World", "Hello"},
+				{11, "Hello\x80World\x80!", "HelloWorld!"},
+				{15, "Hello\x80World\x80Test", "HelloWorldTest"},
+				{15, "Hello\x80\x80\x80World\x80Test", "HelloWorldTest"},
+				{15, "\x80\x80\x80Hello\x80\x80\x80World\x80Test\x80\x80", "HelloWorldTest"},
+			},
+		},
+
+		// Truncation with mixed validn and invalid UTF-8 characters
+		{
+			name: "MixedUTF-8",
+			groups: []group{
+				{6, "€"[0:2] + "hello€€", "hello€"},
+				{6, "€" + "€"[0:2] + "hello", "€hello"},
+				{11, "Valid text\x80📜", "Valid text📜"},
+				{11, "Valid text📜\x80", "Valid text📜"},
+				{14, "😊 Hello\x80World🌍🚀", "😊 HelloWorld🌍🚀"},
+				{14, "😊\x80 Hello\x80World🌍🚀", "😊 HelloWorld🌍🚀"},
+				{14, "😊\x80 Hello\x80World🌍\x80🚀", "😊 HelloWorld🌍🚀"},
+				{14, "😊\x80 Hello\x80World🌍\x80🚀\x80", "😊 HelloWorld🌍🚀"},
+				{14, "\x80😊\x80 Hello\x80World🌍\x80🚀\x80", "😊 HelloWorld🌍🚀"},
+			},
+		},
+
+		// Edge case: empty string, should return empty string
+		{
+			name: "Empty",
+			groups: []group{
+				{5, "", ""},
+			},
+		},
+
+		// Edge case: limit is 0, should return an empty string
+		{
+			name: "Zero",
+			groups: []group{
+				{0, "Some text", ""},
+				{0, "", ""},
+			},
 		},
 	}
 
-	for _, test := range tests {
-		name := fmt.Sprintf("%s->%s(limit:%d)", test.attr.Key, test.attr.Value.Emit(), test.limit)
-		t.Run(name, func(t *testing.T) {
-			assert.Equal(t, test.want, truncateAttr(test.limit, test.attr))
-		})
+	for _, tt := range tests {
+		for _, g := range tt.groups {
+			t.Run(tt.name, func(t *testing.T) {
+				t.Parallel()
+
+				got := truncate(g.limit, g.input)
+				assert.Equalf(
+					t, g.expected, got,
+					"input: %q([]rune%v))\ngot: %q([]rune%v)\nwant %q([]rune%v)",
+					g.input, []rune(g.input),
+					got, []rune(got),
+					g.expected, []rune(g.expected),
+				)
+			})
+		}
+	}
+}
+
+func BenchmarkTruncate(b *testing.B) {
+	run := func(limit int, input string) func(b *testing.B) {
+		return func(b *testing.B) {
+			b.ReportAllocs()
+			b.RunParallel(func(pb *testing.PB) {
+				var out string
+				for pb.Next() {
+					out = truncate(limit, input)
+				}
+				_ = out
+			})
+		}
 	}
+	b.Run("Unlimited", run(-1, "hello 😊 world 🌍🚀"))
+	b.Run("Zero", run(0, "Some text"))
+	b.Run("Short", run(10, "Short Text"))
+	b.Run("ASCII", run(5, "Hello, World!"))
+	b.Run("ValidUTF-8", run(10, "hello 😊 world 🌍🚀"))
+	b.Run("InvalidUTF-8", run(6, "€"[0:2]+"hello€€"))
+	b.Run("MixedUTF-8", run(14, "\x80😊\x80 Hello\x80World🌍\x80🚀\x80"))
 }
 
 func TestLogDropAttrs(t *testing.T) {