Add sources for new parser and tokenizer
Showing 2 changed files with 325 additions and 0 deletions.
File 1 of 2 (new file, +166 lines): the parser module.
from typing import Any, List, Tuple

from ._tokenizer import Tokenizer


def parse_named_requirement(requirement: str) -> Tuple[Any, str, Any, str, str]:
    """
    NAMED_REQUIREMENT: NAME EXTRAS* URL_SPEC (SEMICOLON + MARKER)*
    """
    tokens = Tokenizer(requirement)
    name = parse_identifier(tokens)
    extras = parse_extras(tokens)
    specifier = ""
    url = ""
    if tokens.match("URL_SPEC"):
        url = tokens.read().text[1:].strip()
    elif not tokens.match("stringEnd"):
        specifier = parse_specifier(tokens)
    if tokens.match("SEMICOLON"):
        marker = ""
        while not tokens.match("stringEnd"):
            # we don't validate markers here, it's done later as part of
            # packaging/requirements.py
            marker += tokens.read().text
    else:
        marker = ""
    tokens.expect("stringEnd")
    return (name, url, extras, specifier, marker)


def parse_extras(tokens: Tokenizer) -> List[str]:
    """
    EXTRAS: (LBRACKET + IDENTIFIER + (COLON + IDENTIFIER)* + RBRACKET)*
    """
    extras = []
    if tokens.try_read("LBRACKET"):
        while tokens.match("IDENTIFIER"):
            extras.append(parse_identifier(tokens))
            tokens.try_read("COLON")
        if not tokens.try_read("RBRACKET"):
            tokens.raise_syntax_error("Closing square bracket is missing")
    return extras


def parse_identifier(tokens: Tokenizer) -> Any:
    if tokens.match("IDENTIFIER"):
        return tokens.read().text.strip("'\"")
    else:
        return tokens.raise_syntax_error("Expected IDENTIFIER")


def parse_specifier(tokens: Tokenizer) -> str:
    """
    SPECIFIER: LPAREN (OP + VERSION + COLON)+ RPAREN | OP + VERSION
    """
    parsed_specifiers = ""
    lparen = False
    if tokens.try_read("LPAREN"):
        lparen = True
    while tokens.match("OP"):
        parsed_specifiers += tokens.read("OP").text
        if tokens.match("VERSION"):
            parsed_specifiers += tokens.read("VERSION").text
        else:
            tokens.raise_syntax_error("Missing version")
        if tokens.match("COLON"):
            parsed_specifiers += tokens.read("COLON").text
    if lparen and not tokens.try_read("RPAREN"):
        tokens.raise_syntax_error("Closing right parenthesis is missing")
    return parsed_specifiers
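# Illustrative note, not part of the committed file: for an input such as
# "(>= 1.0, < 2.0)", parse_specifier returns the string ">=1.0,<2.0"; the
# COLON token actually matches a comma (see DEFAULT_RULES in the tokenizer),
# so the separators are kept while whitespace is dropped.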
def parse_quoted_marker(tokens: Tokenizer) -> Any:
    tokens.try_read("SEMICOLON")
    return parse_marker_expr(tokens)


def parse_marker_expr(tokens: Tokenizer) -> List[str]:
    """
    MARKER_EXPR: MARKER_ATOM (BOOLOP + MARKER_ATOM)+
    """
    expression = [parse_marker_atom(tokens)]
    while tokens.match("BOOLOP"):
        tok = tokens.try_read("BOOLOP")
        expr_right = parse_marker_atom(tokens)
        expression.extend((tok.text, expr_right))
    return expression
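# Illustrative note, not part of the committed file: for a marker such as
# 'os_name == "posix" and python_version >= "3.8"', parse_marker_expr builds a
# flat list of the form
#   [(Variable, Op, Value), "and", (Variable, Op, Value)]
# i.e. marker items interleaved with the boolean operator texts.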
def parse_marker_atom(tokens: Tokenizer) -> Any:
    """
    MARKER_ATOM: LPAREN MARKER_EXPR RPAREN | MARKER_ITEM
    """
    if tokens.try_read("LPAREN"):
        marker = parse_marker_expr(tokens)
        if not tokens.try_read("RPAREN"):
            tokens.raise_syntax_error("Closing right parenthesis is missing")
        return marker
    else:
        return parse_marker_item(tokens)


def parse_marker_item(tokens: Tokenizer) -> Tuple[Any, Any, Any]:
    """
    MARKER_ITEM: MARKER_VAR MARKER_OP MARKER_VAR
    """
    marker_var_left = parse_marker_var(tokens)
    marker_op = parse_marker_op(tokens)
    marker_var_right = parse_marker_var(tokens)
    return (marker_var_left, marker_op, marker_var_right)


def parse_marker_var(tokens: Tokenizer) -> Any:
    """
    MARKER_VAR: VARIABLE MARKER_VALUE
    """
    if tokens.match("VARIABLE"):
        return parse_variable(tokens)
    else:
        return parse_python_str(tokens)


def parse_variable(tokens: Tokenizer) -> Any:
    from .markers import Variable

    env_var = tokens.read("VARIABLE").text.replace(".", "_")
    if (
        env_var == "platform_python_implementation"
        or env_var == "python_implementation"
    ):
        return Variable("platform_python_implementation")
    elif env_var == "platform_python_version":
        return Variable("python_full_version")
    elif env_var == "sys_implementation.name":
        return Variable("implementation_name")
    else:
        return Variable(env_var)
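# Illustrative note, not part of the committed file: dotted variable names are
# normalized first, so "platform.python_implementation" becomes
# Variable("platform_python_implementation"), and the legacy alias
# "python_implementation" maps to the same variable.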
def parse_python_str(tokens: Tokenizer) -> Any:
    from .markers import Value

    if tokens.match("QUOTED_STRING"):
        python_str = tokens.read().text.strip("'\"")
        return Value(str(python_str))
    else:
        return tokens.raise_syntax_error(
            "String with single or double quote at the beginning is expected"
        )


def parse_marker_op(tokens: Tokenizer) -> Any:
    from .markers import Op

    if tokens.try_read("IN"):
        return Op("in")
    elif tokens.try_read("NOT"):
        tokens.read("IN")
        return Op("not in")
    elif tokens.match("OP"):
        return Op(tokens.read().text)
    else:
        return tokens.raise_syntax_error(
            'Couldn\'t parse marker operator. Expecting one of \
            "<=, <, !=, ==, >=, >, ~=, ===, not, not in"'
        )
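For orientation, a rough usage sketch of the parser above (illustrative only; the module path and the sample requirement string are assumptions, not part of this commit):

# Rough sketch, not part of this commit; "packaging._parser" is an assumed module path.
from packaging._parser import parse_named_requirement

name, url, extras, specifier, marker = parse_named_requirement(
    "requests [security] >= 2.8.1 ; extra == 'test'"
)
# Roughly: name == "requests", extras == ["security"], specifier == ">=2.8.1",
# url == "", and marker carries the raw, unvalidated marker text for
# packaging/requirements.py to handle later via parse_quoted_marker.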
File 2 of 2 (new file, +159 lines): the tokenizer module.
import dataclasses
import re
from typing import Any, Dict, Generator, Optional, Sized

from .specifiers import Specifier


@dataclasses.dataclass
class Token:
    name: str
    text: str
    position: int

    def matches(self, name: str = "", text: str = "") -> bool:
        if name and self.name != name:
            return False
        if text and self.text != text:
            return False
        return True

    def __str__(self) -> str:
        return f"{self.position}\t{self.name}\t{self.text}"


class ParseException(Exception):
    """Parsing failed"""

    def __init__(self, message: str, position: int) -> None:
        super().__init__(message)
        self.position = position


DEFAULT_RULES = {
    None: r"[ \t]+",  # whitespace: not returned as tokens
    "LPAREN": r"\(",
    "RPAREN": r"\)",
    "LBRACKET": r"\[",
    "RBRACKET": r"\]",
    "SEMICOLON": r";",
    "COLON": r",",
    "QUOTED_STRING": re.compile(
        r"""
            ('[^']*')
            |
            ("[^"]*")
        """,
        re.VERBOSE,
    ),
    "OP": r"===|==|~=|!=|<=|>=|<|>",
    "VERSION": re.compile(Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE),
    "BOOLOP": r"or|and",
    "IN": r"in",
    "NOT": r"not",
    "VARIABLE": re.compile(
        r"""
            python_version
            |python_full_version
            |os[._]name
            |sys[._]platform
            |platform_(release|system)
            |platform[._](version|machine|python_implementation)
            |python_implementation
            |implementation_(name|version)
            |extra
        """,
        re.VERBOSE,
    ),
    "URL_SPEC": "@ *[^ ]+",
    "IDENTIFIER": r"([a-zA-Z0-9]|-|_|\.)+",
}
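# Illustrative note, not part of the committed file: rule order matters because
# _tokenize() tries the patterns in dict order. The None-keyed whitespace rule
# is consumed but never emitted as a token, and keyword-like rules such as
# BOOLOP, IN and NOT are tried before the catch-all IDENTIFIER.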
class Tokenizer:
    """Stream of tokens for a LL(1) parser.
    Provides methods to examine the next token to be read, and to read it
    (advance to the next token).
    """

    next_token: Any

    def __init__(
        self, source: Sized, rules: Dict[Optional[str], object] = DEFAULT_RULES
    ) -> None:
        self.source = source
        self.rules = {name: re.compile(pattern) for name, pattern in rules.items()}
        self.next_token = None
        self.generator = self._tokenize()
        self.position = 0

    def peek(self, *match_args: None, **match_kwargs: None) -> Any:
        """Return the next token to be read"""
        if not self.next_token:
            self.next_token = next(self.generator)
        return self.next_token

    def match(self, *match_args: str, **match_kwargs: None) -> Any:
        """Return True if the next token matches the given arguments"""
        token = self.peek()
        return token.matches(*match_args, **match_kwargs)

    def expect(self, *match_args: str, **match_kwargs: None) -> Any:
        """Raise SyntaxError if the next token doesn't match given arguments"""
        token = self.peek()
        if not token.matches(*match_args, **match_kwargs):
            exp = " ".join(
                v
                for v in match_args
                + tuple(f"{k}={v!r}" for k, v in match_kwargs.items())
                if v
            )
            raise self.raise_syntax_error(f"Expected {exp}")
        return token

    def read(self, *match_args: str, **match_kwargs: None) -> Any:
        """Return the next token and advance to the next token
        Raise SyntaxError if the token doesn't match.
        """
        result = self.expect(*match_args, **match_kwargs)
        self.next_token = None
        return result

    def try_read(self, *match_args: str, **match_kwargs: None) -> Any:
        """read() if the next token matches the given arguments
        Do nothing if it does not match.
        """
        if self.match(*match_args, **match_kwargs):
            return self.read()

    def raise_syntax_error(self, message: str = "Invalid marker") -> Any:
        """Raise SyntaxError at the given position in the marker"""
        at = f"at position {self.position}:"
        marker = " " * self.position + "^"
        raise ParseException(
            f"{message}\n{at}\n {self.source}\n {marker}",
            self.position,
        )

    def _make_token(self, name: str, text: str) -> Token:
        """Make a token with the current position"""
        return Token(name, text, self.position)

    def _tokenize(self) -> Generator[Token, Token, None]:
        """The main generator of tokens"""
        while self.position < len(self.source):
            for name, expression in self.rules.items():
                match = expression.match(self.source, self.position)
                if match:
                    token_text = match[0]

                    if name:
                        yield self._make_token(name, token_text)
                    self.position += len(token_text)
                    break
            else:
                raise self.raise_syntax_error()
        yield self._make_token("stringEnd", "")
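A minimal sketch of driving the tokenizer by hand (illustrative only, not part of this commit; it assumes the Tokenizer class from this file is in scope):

# Walk the token stream for a simple marker expression.
tokens = Tokenizer("python_version >= '3.8'")
print(tokens.read("VARIABLE").text)       # python_version
print(tokens.read("OP").text)             # >=
print(tokens.read("QUOTED_STRING").text)  # '3.8'
tokens.expect("stringEnd")                # raises ParseException if input remains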