diff --git a/src/regexp/syntax/doc.go b/src/regexp/syntax/doc.go index eb8a971c734874..abc58bb15924e0 100644 --- a/src/regexp/syntax/doc.go +++ b/src/regexp/syntax/doc.go @@ -7,12 +7,12 @@ /* Package syntax parses regular expressions into parse trees and compiles parse trees into programs. Most clients of regular expressions will use the -facilities of package regexp (such as Compile and Match) instead of this package. +facilities of package [regexp] (such as [regexp.Compile] and [regexp.Match]) instead of this package. # Syntax -The regular expression syntax understood by this package when parsing with the Perl flag is as follows. -Parts of the syntax can be disabled by passing alternate flags to Parse. +The regular expression syntax understood by this package when parsing with the [Perl] flag is as follows. +Parts of the syntax can be disabled by passing alternate flags to [Parse]. Single characters: @@ -137,6 +137,6 @@ ASCII character classes: [[:word:]] word characters (== [0-9A-Za-z_]) [[:xdigit:]] hex digit (== [0-9A-Fa-f]) -Unicode character classes are those in unicode.Categories and unicode.Scripts. +Unicode character classes are those in [unicode.Categories] and [unicode.Scripts]. */ package syntax diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go index 6a11b53fb1806d..18ee5112034268 100644 --- a/src/regexp/syntax/parse.go +++ b/src/regexp/syntax/parse.go @@ -1255,17 +1255,33 @@ Loop: return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]} } -// isValidCaptureName reports whether name -// is a valid capture name: [A-Za-z0-9_]+. -// PCRE limits names to 32 bytes. -// Python rejects names starting with digits. -// We don't enforce either of those. +// isValidCaptureName reports whether name is a valid capture name. func isValidCaptureName(name string) bool { if name == "" { return false } + + // Historically, we effectively used [0-9A-Za-z_]+ to validate; that + // followed Python 2 except for not restricting the first character. 
+ // As of Python 3, Unicode characters beyond ASCII are also allowed; + // accordingly, we permit the Lu, Ll, Lt, Lm, Lo, Nl, Mn, Mc, Nd and + // Pc categories, but again without restricting the first character. + // Also, Unicode normalization (e.g. NFKC) isn't performed: Python 3 + // performs it for identifiers, but seemingly not for capture names; + // if they start doing that for capture names, we won't follow suit. for _, c := range name { - if c != '_' && !isalnum(c) { + if !unicode.In(c, + unicode.Lu, + unicode.Ll, + unicode.Lt, + unicode.Lm, + unicode.Lo, + unicode.Nl, + unicode.Mn, + unicode.Mc, + unicode.Nd, + unicode.Pc, + ) { return false } } diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go index 0f885bd5c8149f..5f143aaf527bf4 100644 --- a/src/regexp/syntax/parse_test.go +++ b/src/regexp/syntax/parse_test.go @@ -160,7 +160,9 @@ var parseTests = []parseTest{ // Test named captures {`(?P<name>a)`, `cap{name:lit{a}}`}, + {`(?P<中文>a)`, `cap{中文:lit{a}}`}, {`(?<name>a)`, `cap{name:lit{a}}`}, + {`(?<中文>a)`, `cap{中文:lit{a}}`}, // Case-folded literals {`[Aa]`, `litfold{A}`},