golang · igorzhilianin · Dec 11, 2023
diff --git a/src/regexp/syntax/doc.go b/src/regexp/syntax/doc.go
@@ -7,12 +7,12 @@
 /*
 Package syntax parses regular expressions into parse trees and compiles
 parse trees into programs. Most clients of regular expressions will use the
-facilities of package regexp (such as Compile and Match) instead of this package.
+facilities of package [regexp] (such as [regexp.Compile] and [regexp.Match]) instead of this package.
 
 # Syntax
 
-The regular expression syntax understood by this package when parsing with the Perl flag is as follows.
-Parts of the syntax can be disabled by passing alternate flags to Parse.
+The regular expression syntax understood by this package when parsing with the [Perl] flag is as follows.
+Parts of the syntax can be disabled by passing alternate flags to [Parse].
 
 Single characters:
 
@@ -137,6 +137,6 @@ ASCII character classes:
 	[[:word:]]     word characters (== [0-9A-Za-z_])
 	[[:xdigit:]]   hex digit (== [0-9A-Fa-f])
 
-Unicode character classes are those in unicode.Categories and unicode.Scripts.
+Unicode character classes are those in [unicode.Categories] and [unicode.Scripts].
 */
 package syntax
diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go
@@ -1255,17 +1255,33 @@ Loop:
 	return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
 }
 
-// isValidCaptureName reports whether name
-// is a valid capture name: [A-Za-z0-9_]+.
-// PCRE limits names to 32 bytes.
-// Python rejects names starting with digits.
-// We don't enforce either of those.
+// isValidCaptureName reports whether name is a valid capture name.
 func isValidCaptureName(name string) bool {
 	if name == "" {
 		return false
 	}
+
+	// Historically, we effectively used [0-9A-Za-z_]+ to validate; that
+	// followed Python 2 except for not restricting the first character.
+	// As of Python 3, Unicode characters beyond ASCII are also allowed;
+	// accordingly, we permit the Lu, Ll, Lt, Lm, Lo, Nl, Mn, Mc, Nd and
+	// Pc categories, but again without restricting the first character.
+	// Also, Unicode normalization (e.g. NFKC) isn't performed: Python 3
+	// performs it for identifiers, but seemingly not for capture names;
+	// if they start doing that for capture names, we won't follow suit.
 	for _, c := range name {
-		if c != '_' && !isalnum(c) {
+		if !unicode.In(c,
+			unicode.Lu,
+			unicode.Ll,
+			unicode.Lt,
+			unicode.Lm,
+			unicode.Lo,
+			unicode.Nl,
+			unicode.Mn,
+			unicode.Mc,
+			unicode.Nd,
+			unicode.Pc,
+		) {
 			return false
 		}
 	}

diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go
@@ -160,7 +160,9 @@ var parseTests = []parseTest{
 
 	// Test named captures
 	{`(?P<name>a)`, `cap{name:lit{a}}`},
+	{`(?P<中文>a)`, `cap{中文:lit{a}}`},
 	{`(?<name>a)`, `cap{name:lit{a}}`},
+	{`(?<中文>a)`, `cap{中文:lit{a}}`},
 
 	// Case-folded literals
 	{`[Aa]`, `litfold{A}`},