пре 7 месеци · faff1c39d1
--- a/core/pkg/filter/ast/lexer.go
+++ b/core/pkg/filter/ast/lexer.go
@@ -2,6 +2,8 @@ package ast
 
															 import (
														
 
															 	"fmt"
														
 
															+	"unicode"
														
 
															+	"unicode/utf8"
														
 
															 	multierror "github.com/hashicorp/go-multierror"
														
 
															 )
														
@@ -128,25 +130,28 @@ func (s scanner) atEnd() bool {
 
															 	return s.nextByte >= len(s.source)
														
 
															 }
														
 
															-// advance returns a byte because we only accept ASCII, which has to fit in a
														
 
															-// byte
														
 
															-//
														
 
															-// TODO: If we add unicode support, advance() will probably have to return a
														
 
															-// rune.
														
 
															-func (s *scanner) advance() byte {
														
 
															-	b := s.source[s.nextByte]
														
 
															-	s.nextByte += 1
														
 
															-	return b
														
 
															+// advance returns a rune to support Unicode characters
														
 
															+func (s *scanner) advance() rune {
														
 
															+	if s.atEnd() {
														
 
															+		return 0
														
 
															+	}
														
 
															+	
														
 
															+	r, size := utf8.DecodeRuneInString(s.source[s.nextByte:])
														
 
															+	s.nextByte += size
														
 
															+	return r
														
 
															 }
														
 
															-func (s *scanner) match(expected byte) bool {
														
 
															+func (s *scanner) match(expected rune) bool {
														
 
															 	if s.atEnd() {
														
 
															 		return false
														
 
															 	}
														
 
															-	if s.source[s.nextByte] != expected {
														
 
															+	
														
 
															+	// Get the rune at the current position
														
 
															+	r, size := utf8.DecodeRuneInString(s.source[s.nextByte:])
														
 
															+	if r != expected {
														
 
															 		return false
														
 
															 	}
														
 
															-	s.nextByte += 1
														
 
															+	s.nextByte += size
														
 
															 	return true
														
 
															 }
														
@@ -164,11 +169,14 @@ func (s *scanner) addToken(kind tokenKind) {
 
															 	})
														
 
															 }
														
 
															-func (s *scanner) peek() byte {
														
 
															+func (s *scanner) peek() rune {
														
 
															 	if s.atEnd() {
														
 
															 		return 0
														
 
															 	}
														
 
															-	return s.source[s.nextByte]
														
 
															+	
														
 
															+	// Get the rune at the current position
														
 
															+	r, _ := utf8.DecodeRuneInString(s.source[s.nextByte:])
														
 
															+	return r
														
 
															 }
														
 
															 func (s *scanner) scanToken() {
														
@@ -246,6 +254,12 @@ func (s *scanner) scanToken() {
 
															 	case ' ', '\t', '\n', '\r':
														
 
															 		break
														
 
															 	default:
														
 
															+		// Check for invalid UTF-8 sequences
														
 
															+		if c == utf8.RuneError {
														
 
															+			s.errors = append(s.errors, fmt.Errorf("invalid UTF-8 character at position %d", s.nextByte-1))
														
 
															+			break
														
 
															+		}
														
 
															+		
														
 
															 		// identifiers
														
 
															 		//
														
 
															 		// We can keep it simple and not _force_ the first character to be a
														
@@ -258,10 +272,12 @@ func (s *scanner) scanToken() {
 
															 			break
														
 
															 		}
														
 
															-		// TODO: We could return a more exact error message for Unicode chars if
														
 
															-		// we added extra handling:
														
 
															-		// https://stackoverflow.com/questions/53069040/checking-a-string-contains-only-ascii-characters
														
 
															-		s.errors = append(s.errors, fmt.Errorf("unexpected character/byte at position %d. Please avoid Unicode.", s.nextByte-1))
														
 
															+		// Check if the character is a Unicode character for a more precise error message
														
 
															+		if c > 127 {
														
 
															+			s.errors = append(s.errors, fmt.Errorf("unexpected Unicode character '%c' (U+%04X) at position %d", c, c, s.nextByte-1))
														
 
															+		} else {
														
 
															+			s.errors = append(s.errors, fmt.Errorf("unexpected character '%c' at position %d", c, s.nextByte-1))
														
 
															+		}
														
 
															 	}
														
 
															 }
														
@@ -269,14 +285,10 @@ func (s *scanner) scanToken() {
 
															 //
														
 
															 // https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
														
 
															 //
														
 
															-// TODO: This may not match all characters we support for cluster IDs (it may be
														
 
															-// the case that cluster IDs can contain UTF-8 characters).
														
 
															-func isIdentifierChar(b byte) bool {
														
 
															-	return (b >= '0' && b <= '9') || // 0-9
														
 
															-		(b >= 'A' && b <= 'Z') || // A-Z
														
 
															-		(b >= 'a' && b <= 'z') || // a-z
														
 
															-		b == '-' || // hyphens are allowed according to K8s spec
														
 
															-		b == '_' // underscores are allowed because of Prometheus sanitization
														
 
															+// This has been updated to support UTF-8 characters for cluster IDs.
														
 
															+func isIdentifierChar(r rune) bool {
														
 
															+	// Allow letters, digits, hyphens, and underscores.
														
 
															+	return unicode.IsLetter(r) || unicode.IsDigit(r) || r == '-' || r == '_'
														
 
															 }
														
 
															 func (s *scanner) string() {
														
--- a/core/pkg/filter/ast/unicode_test.go
+++ b/core/pkg/filter/ast/unicode_test.go
@@ -0,0 +1,84 @@
 
															+package ast
														
 
															+
														
 
															+import (
														
 
															+	"testing"
														
 
															+)
														
 
															+
														
 
															+func TestUnicodeSupport(t *testing.T) {
														
 
															+	// Test Unicode characters in identifiers
														
 
															+	cases := []struct {
														
 
															+		name        string
														
 
															+		input       string
														
 
															+		expectError bool
														
 
															+	}{
														
 
															+		{
														
 
															+			name:        "Unicode identifier",
														
 
															+			input:       "café",
														
 
															+			expectError: false,
														
 
															+		},
														
 
															+		{
														
 
															+			name:        "Unicode in keyed access",
														
 
															+			input:       "[café]",
														
 
															+			expectError: false,
														
 
															+		},
														
 
															+		{
														
 
															+			name:        "Unicode with valid field",
														
 
															+			input:       "namespace:café",
														
 
															+			expectError: false,
														
 
															+		},
														
 
															+	}
														
 
															+
														
 
															+	for _, c := range cases {
														
 
															+		t.Run(c.name, func(t *testing.T) {
														
 
															+			_, err := lex(c.input, allocFields, allocMapFields)
														
 
															+			if c.expectError && err == nil {
														
 
															+				t.Errorf("expected error but got nil")
														
 
															+			} else if !c.expectError && err != nil {
														
 
															+				t.Errorf("unexpected error: %s", err)
														
 
															+			}
														
 
															+		})
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+func TestUnicodeErrorMessages(t *testing.T) {
														
 
															+	// Test that Unicode characters produce better error messages
														
 
															+	_, err := lex("café@", allocFields, allocMapFields)
														
 
															+	if err == nil {
														
 
															+		t.Errorf("expected error but got nil")
														
 
															+		return
														
 
															+	}
														
 
															+	
														
 
															+	// Check that the error message contains Unicode information
														
 
															+	errStr := err.Error()
														
 
															+	if len(errStr) == 0 {
														
 
															+		t.Errorf("expected error message to contain Unicode information")
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+func TestInvalidUTF8(t *testing.T) {
														
 
															+	// Test invalid UTF-8 sequences
														
 
															+	// This is a string with an invalid UTF-8 sequence
														
 
															+	invalidUTF8 := "\xC0\x80" // Invalid UTF-8 sequence
														
 
															+	
														
 
															+	_, err := lex(invalidUTF8, allocFields, allocMapFields)
														
 
															+	if err == nil {
														
 
															+		t.Errorf("expected error for invalid UTF-8 but got nil")
														
 
															+		return
														
 
															+	}
														
 
															+	
														
 
															+	// Check that the error message mentions invalid UTF-8
														
 
															+	errStr := err.Error()
														
 
															+	if len(errStr) == 0 || !contains(errStr, "invalid UTF-8") {
														
 
															+		t.Errorf("expected error message to mention invalid UTF-8, got: %s", errStr)
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+// Helper function to check if a string contains a substring
														
 
															+func contains(s, substr string) bool {
														
 
															+	for i := 0; i <= len(s)-len(substr); i++ {
														
 
															+		if s[i:i+len(substr)] == substr {
														
 
															+			return true
														
 
															+		}
														
 
															+	}
														
 
															+	return false
														
 
															+}