Просмотр исходного кода

feat: Implement Unicode support in filter lexer with improved error handling (#3299)

Signed-off-by: sneax <paladesh600@gmail.com>
Co-authored-by: Alex Meijer <ameijer@users.noreply.github.com>
segfault_bits 7 месяцев назад
Родитель
Commit
faff1c39d1
2 измененных файлов с 122 добавлено и 26 удалено
  1. 38 26
      core/pkg/filter/ast/lexer.go
  2. 84 0
      core/pkg/filter/ast/unicode_test.go

+ 38 - 26
core/pkg/filter/ast/lexer.go

@@ -2,6 +2,8 @@ package ast
 
 import (
 	"fmt"
+	"unicode"
+	"unicode/utf8"
 
 	multierror "github.com/hashicorp/go-multierror"
 )
@@ -128,25 +130,28 @@ func (s scanner) atEnd() bool {
 	return s.nextByte >= len(s.source)
 }
 
-// advance returns a byte because we only accept ASCII, which has to fit in a
-// byte
-//
-// TODO: If we add unicode support, advance() will probably have to return a
-// rune.
-func (s *scanner) advance() byte {
-	b := s.source[s.nextByte]
-	s.nextByte += 1
-	return b
+// advance returns a rune to support Unicode characters
+func (s *scanner) advance() rune {
+	if s.atEnd() {
+		return 0
+	}
+	
+	r, size := utf8.DecodeRuneInString(s.source[s.nextByte:])
+	s.nextByte += size
+	return r
 }
 
-func (s *scanner) match(expected byte) bool {
+func (s *scanner) match(expected rune) bool {
 	if s.atEnd() {
 		return false
 	}
-	if s.source[s.nextByte] != expected {
+	
+	// Get the rune at the current position
+	r, size := utf8.DecodeRuneInString(s.source[s.nextByte:])
+	if r != expected {
 		return false
 	}
-	s.nextByte += 1
+	s.nextByte += size
 	return true
 }
 
@@ -164,11 +169,14 @@ func (s *scanner) addToken(kind tokenKind) {
 	})
 }
 
-func (s *scanner) peek() byte {
+func (s *scanner) peek() rune {
 	if s.atEnd() {
 		return 0
 	}
-	return s.source[s.nextByte]
+	
+	// Get the rune at the current position
+	r, _ := utf8.DecodeRuneInString(s.source[s.nextByte:])
+	return r
 }
 
 func (s *scanner) scanToken() {
@@ -246,6 +254,12 @@ func (s *scanner) scanToken() {
 	case ' ', '\t', '\n', '\r':
 		break
 	default:
+		// Check for invalid UTF-8 sequences
+		if c == utf8.RuneError {
+			s.errors = append(s.errors, fmt.Errorf("invalid UTF-8 character at position %d", s.nextByte-1))
+			break
+		}
+		
 		// identifiers
 		//
 		// We can keep it simple and not _force_ the first character to be a
@@ -258,10 +272,12 @@ func (s *scanner) scanToken() {
 			break
 		}
 
-		// TODO: We could return a more exact error message for Unicode chars if
-		// we added extra handling:
-		// https://stackoverflow.com/questions/53069040/checking-a-string-contains-only-ascii-characters
-		s.errors = append(s.errors, fmt.Errorf("unexpected character/byte at position %d. Please avoid Unicode.", s.nextByte-1))
+		// Check if the character is a Unicode character for a more precise error message
+		if c > 127 {
+			s.errors = append(s.errors, fmt.Errorf("unexpected Unicode character '%c' (U+%04X) at position %d", c, c, s.nextByte-1))
+		} else {
+			s.errors = append(s.errors, fmt.Errorf("unexpected character '%c' at position %d", c, s.nextByte-1))
+		}
 	}
 }
 
@@ -269,14 +285,10 @@ func (s *scanner) scanToken() {
 //
 // https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
 //
-// TODO: This may not match all characters we support for cluster IDs (it may be
-// the case that cluster IDs can contain UTF-8 characters).
-func isIdentifierChar(b byte) bool {
-	return (b >= '0' && b <= '9') || // 0-9
-		(b >= 'A' && b <= 'Z') || // A-Z
-		(b >= 'a' && b <= 'z') || // a-z
-		b == '-' || // hyphens are allowed according to K8s spec
-		b == '_' // underscores are allowed because of Prometheus sanitization
+// This has been updated to support UTF-8 characters for cluster IDs.
+func isIdentifierChar(r rune) bool {
+	// Allow letters, digits, hyphens, and underscores.
+	return unicode.IsLetter(r) || unicode.IsDigit(r) || r == '-' || r == '_'
 }
 
 func (s *scanner) string() {

+ 84 - 0
core/pkg/filter/ast/unicode_test.go

@@ -0,0 +1,84 @@
+package ast
+
+import (
+	"testing"
+)
+
+func TestUnicodeSupport(t *testing.T) {
+	// Test Unicode characters in identifiers
+	cases := []struct {
+		name        string
+		input       string
+		expectError bool
+	}{
+		{
+			name:        "Unicode identifier",
+			input:       "café",
+			expectError: false,
+		},
+		{
+			name:        "Unicode in keyed access",
+			input:       "[café]",
+			expectError: false,
+		},
+		{
+			name:        "Unicode with valid field",
+			input:       "namespace:café",
+			expectError: false,
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			_, err := lex(c.input, allocFields, allocMapFields)
+			if c.expectError && err == nil {
+				t.Errorf("expected error but got nil")
+			} else if !c.expectError && err != nil {
+				t.Errorf("unexpected error: %s", err)
+			}
+		})
+	}
+}
+
+func TestUnicodeErrorMessages(t *testing.T) {
+	// Test that Unicode characters produce better error messages
+	_, err := lex("café@", allocFields, allocMapFields)
+	if err == nil {
+		t.Errorf("expected error but got nil")
+		return
+	}
+	
+	// Check that the error message contains Unicode information
+	errStr := err.Error()
+	if len(errStr) == 0 {
+		t.Errorf("expected error message to contain Unicode information")
+	}
+}
+
+func TestInvalidUTF8(t *testing.T) {
+	// Test invalid UTF-8 sequences
+	// This is a string with an invalid UTF-8 sequence
+	invalidUTF8 := "\xC0\x80" // Invalid UTF-8 sequence
+	
+	_, err := lex(invalidUTF8, allocFields, allocMapFields)
+	if err == nil {
+		t.Errorf("expected error for invalid UTF-8 but got nil")
+		return
+	}
+	
+	// Check that the error message mentions invalid UTF-8
+	errStr := err.Error()
+	if len(errStr) == 0 || !contains(errStr, "invalid UTF-8") {
+		t.Errorf("expected error message to mention invalid UTF-8, got: %s", errStr)
+	}
+}
+
// contains reports whether substr is within s. It delegates to the
// standard library instead of the previous hand-rolled O(n*m) scan;
// kept as a named helper for call-site readability in this file's tests.
func contains(s, substr string) bool {
	return strings.Contains(s, substr)
}