Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handwritten parser for parsing requirements #484

Merged
merged 6 commits into from
Jul 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
228 changes: 228 additions & 0 deletions packaging/_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
# The docstring for each parse function contains the grammar for the rule.
# The grammar uses a simple EBNF-inspired syntax:
#
# - Uppercase names are tokens
# - Lowercase names are rules (parsed with a parse_* function)
# - Parentheses are used for grouping
# - A | means either-or
# - A * means 0 or more
# - A + means 1 or more
# - A ? means 0 or 1

from ast import literal_eval
from typing import Any, List, NamedTuple, Tuple, Union

from ._tokenizer import Tokenizer


class Node:
    """Base class for the leaves of a parsed marker: a thin wrapper around one string.

    Subclasses decide how the wrapped value is written back out by
    overriding ``serialize``.
    """

    def __init__(self, value: str) -> None:
        # The raw text this node stands for.
        self.value = value

    def __str__(self) -> str:
        """Return the wrapped value unchanged."""
        return self.value

    def __repr__(self) -> str:
        """Return ``<ClassName('value')>`` using the concrete subclass name."""
        kind = self.__class__.__name__
        return "<{}('{}')>".format(kind, self)

    def serialize(self) -> str:
        """Return the textual form of the node; must be provided by subclasses."""
        raise NotImplementedError()


class Variable(Node):
    """A marker operand that names an environment variable."""

    def serialize(self) -> str:
        """Return the variable name as-is (no quoting)."""
        return "{}".format(self)


class Value(Node):
    """A marker operand that is a literal string."""

    def serialize(self) -> str:
        """Return the value wrapped in double quotes."""
        return '"' + str(self) + '"'


class Op(Node):
    """A marker comparison operator such as ``==`` or ``not in``."""

    def serialize(self) -> str:
        """Return the operator text as-is."""
        return "{}".format(self)


# One side of a marker comparison: an environment variable or a literal string.
MarkerVar = Union[Variable, Value]
# A single comparison: (left operand, operator, right operand).
MarkerItem = Tuple[MarkerVar, Op, MarkerVar]
# The intended (recursive) shapes of the remaining aliases are:
# MarkerAtom = Union[MarkerItem, List["MarkerAtom"]]
# MarkerList = List[Union["MarkerList", MarkerAtom, str]]
# mypy does not support recursive type definitions, so loose placeholders are
# used instead:
# https://github.com/python/mypy/issues/731
MarkerAtom = Any
MarkerList = List[Any]


class Requirement(NamedTuple):
    """Raw pieces of a requirement string, as produced by parse_named_requirement().

    Every field holds unvalidated text taken from the token stream; parts that
    are absent from the input are represented by an empty string (or an empty
    list for ``extras``).
    """

    # Text of the leading IDENTIFIER token (the package name).
    name: str
    # URL_SPEC token text with the leading "@" dropped and whitespace stripped.
    url: str
    # Identifier texts found between "[" and "]", in input order.
    extras: List[str]
    # Concatenated operator / version / comma token texts, e.g. ">=1.0,<2.0".
    specifier: str
    # Concatenated text of every token after ";" (not validated by the parser).
    marker: str


def parse_named_requirement(requirement: str) -> Requirement:
    """
    named_requirement:
        IDENTIFIER extras (URL_SPEC | specifier) (SEMICOLON marker_expr)? END

    Split ``requirement`` into its name, extras, URL or version specifier and
    marker text, and return them as a ``Requirement``. The marker is only
    collected here, not parsed: the texts of all tokens after the semicolon
    are concatenated as-is.
    """
    tokens = Tokenizer(requirement)

    # read() checks the token type itself, so a separate expect() is not needed.
    name = tokens.read(
        "IDENTIFIER", error_message="Expression must begin with package name"
    ).text
    extras = parse_extras(tokens)

    url = specifier = ""
    if tokens.match("URL_SPEC"):
        # The token text starts with "@"; keep only what follows it.
        url = tokens.read().text[1:].strip()
    elif not tokens.match("END"):
        specifier = parse_specifier(tokens)

    marker_parts: List[str] = []
    if tokens.try_read("SEMICOLON"):
        # we don't validate markers here, it's done later as part of
        # packaging/requirements.py
        while not tokens.match("END"):
            marker_parts.append(tokens.read().text)

    tokens.expect(
        "END",
        error_message="Expected semicolon (followed by markers) or end of string",
    )
    return Requirement(
        name=name,
        url=url,
        extras=extras,
        specifier=specifier,
        marker="".join(marker_parts),
    )


def parse_extras(tokens: Tokenizer) -> List[str]:
    """
    extras: LBRACKET (IDENTIFIER (COMMA IDENTIFIER)*)? RBRACKET

    Parse an optional bracketed list of extras and return the extra names in
    input order. Returns an empty list when there is no opening bracket or the
    brackets are empty.

    Every comma must be followed by another extra name, as the grammar above
    requires; a trailing or doubled comma is reported as an error instead of
    being silently accepted.

    Raises ParseExceptionError (via the tokenizer) when a comma, an extra name
    after a comma, or the closing bracket is missing.
    """
    extras: List[str] = []
    if not tokens.try_read("LBRACKET"):
        return extras

    if tokens.match("IDENTIFIER"):
        extras.append(tokens.read("IDENTIFIER").text)
        # After the first name, each further name must be introduced by a
        # comma; anything other than "]" here has to be that comma.
        while not tokens.match("RBRACKET"):
            tokens.read("COMMA", error_message="Missing comma after extra")
            extras.append(
                tokens.read(
                    "IDENTIFIER",
                    error_message="Expected extra name after comma",
                ).text
            )

    tokens.read("RBRACKET", error_message="Closing square bracket is missing")
    return extras


def parse_specifier(tokens: Tokenizer) -> str:
    """
    specifier:
        LPAREN version_many? RPAREN | version_many

    Return the version specifier text, which may optionally be wrapped in
    parentheses. The parentheses themselves are not part of the result.
    """
    parenthesized = tokens.try_read("LPAREN") is not None
    specifiers = parse_version_many(tokens)
    # An opening parenthesis must be balanced by a closing one.
    if parenthesized and tokens.try_read("RPAREN") is None:
        tokens.raise_syntax_error(message="Closing right parenthesis is missing")
    return specifiers


def parse_version_many(tokens: Tokenizer) -> str:
    """
    version_many: OP VERSION (COMMA OP VERSION)*

    Return the concatenated text of the operator, version and comma tokens.
    Returns "" when the next token is not an operator. A comma that is not
    followed by another operator ends the loop and stays in the result.
    """
    pieces: List[str] = []
    while tokens.match("OP"):
        pieces.append(tokens.read("OP").text)

        # Every operator needs a version right after it.
        if not tokens.match("VERSION"):
            tokens.raise_syntax_error(message="Missing version")
        pieces.append(tokens.read("VERSION").text)

        comma = tokens.try_read("COMMA")
        if comma is None:
            break
        pieces.append(comma.text)
    return "".join(pieces)


def parse_marker_expr(tokens: Tokenizer) -> MarkerList:
    """
    marker_expr: marker_atom (BOOLOP marker_atom)*

    Return a flat list that alternates parsed atoms with the text of the
    boolean operators between them: ``[atom, "and", atom, "or", atom, ...]``.
    """
    result: MarkerList = [parse_marker_atom(tokens)]
    while True:
        boolop = tokens.try_read("BOOLOP")
        if boolop is None:
            break
        result.append(boolop.text)
        result.append(parse_marker_atom(tokens))
    return result


def parse_marker_atom(tokens: Tokenizer) -> MarkerAtom:
    """
    marker_atom: LPAREN marker_expr RPAREN | marker_item

    Return either a single comparison or, for a parenthesized group, the
    nested list produced by ``parse_marker_expr``.
    """
    if not tokens.try_read("LPAREN"):
        return parse_marker_item(tokens)

    nested = parse_marker_expr(tokens)
    tokens.read("RPAREN", error_message="Closing right parenthesis is missing")
    return nested


def parse_marker_item(tokens: Tokenizer) -> MarkerItem:
    """
    marker_item: marker_var marker_op marker_var

    Return the comparison as a ``(left, operator, right)`` tuple.
    """
    # Tuple elements are evaluated left to right, which is the order the
    # tokens must be consumed in.
    return (
        parse_marker_var(tokens),
        parse_marker_op(tokens),
        parse_marker_var(tokens),
    )


def parse_marker_var(tokens: Tokenizer) -> MarkerVar:
    """
    marker_var: env_var | python_str

    Dispatch on the upcoming token: a VARIABLE token is parsed as an
    environment variable, anything else must be a quoted string.
    """
    parse_operand = parse_env_var if tokens.match("VARIABLE") else parse_python_str
    return parse_operand(tokens)


def parse_env_var(tokens: Tokenizer) -> Variable:
    """
    env_var: VARIABLE

    Return the variable with dots in its name replaced by underscores;
    ``python_implementation`` is mapped to ``platform_python_implementation``.
    """
    name = tokens.read("VARIABLE").text.replace(".", "_")
    if name == "python_implementation":
        name = "platform_python_implementation"
    return Variable(name)


def parse_python_str(tokens: Tokenizer) -> Value:
    """
    python_str: QUOTED_STRING

    Evaluate the quoted token as a Python literal and wrap its string form
    in a ``Value``.
    """
    quoted = tokens.read(
        "QUOTED_STRING",
        error_message="String with single or double quote at the beginning is expected",
    )
    # literal_eval removes the surrounding quotes and processes any escapes.
    return Value(str(literal_eval(quoted.text)))


def parse_marker_op(tokens: Tokenizer) -> Op:
    """
    marker_op: IN | NOT IN | OP

    Return the operator as an ``Op`` whose text is ``"in"``, ``"not in"`` or
    the text of the OP token.

    Raises ParseExceptionError (via the tokenizer) when NOT is not followed by
    IN, or when the next token is not an operator at all.
    """
    if tokens.try_read("IN"):
        return Op("in")
    if tokens.try_read("NOT"):
        tokens.read("IN", error_message="NOT token must be followed by IN token")
        return Op("not in")
    if tokens.match("OP"):
        return Op(tokens.read().text)
    # Adjacent literals instead of a backslash continuation inside the string,
    # so source indentation cannot leak into the message.
    tokens.raise_syntax_error(
        message=(
            "Couldn't parse marker operator. Expecting one of "
            '"<=, <, !=, ==, >=, >, ~=, ===, in, not in"'
        )
    )
164 changes: 164 additions & 0 deletions packaging/_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import re
from typing import Dict, Generator, NoReturn, Optional

from .specifiers import Specifier


class Token:
    """A single lexical token: its rule name, its text and where it was found."""

    def __init__(self, name: str, text: str, position: int) -> None:
        self.name, self.text, self.position = name, text, position

    def matches(self, name: str = "") -> bool:
        """Return True when ``name`` is empty or equals this token's rule name."""
        return not name or self.name == name


class ParseExceptionError(Exception):
    """
    Raised when the input cannot be tokenized or parsed.

    ``position`` records the offset that was passed in alongside the message.
    """

    def __init__(self, message: str, position: int) -> None:
        self.position = position
        super().__init__(message)


# Token name -> regular expression. Values are either pattern strings or
# already-compiled patterns; Tokenizer.__init__ passes each one through
# re.compile().
#
# The patterns spelled out here all begin with ``\s*`` so that whitespace in
# front of a token is consumed together with it.
#
# Order matters: Tokenizer._tokenize tries the rules in dict order and emits
# the first one that matches at the current position, so the more specific
# rules (keywords, variables, versions) are listed before IDENTIFIER.
#
# NOTE(review): BOOLOP, IN, NOT and VARIABLE have no trailing word boundary,
# so a word that merely starts with one of them (e.g. "orjson", "install") is
# matched by the keyword rule before IDENTIFIER is tried -- confirm this is
# intended.
DEFAULT_RULES = {
    # Punctuation.
    "LPAREN": r"\s*\(",
    "RPAREN": r"\s*\)",
    "LBRACKET": r"\s*\[",
    "RBRACKET": r"\s*\]",
    "SEMICOLON": r"\s*;",
    "COMMA": r"\s*,",
    # A single- or double-quoted string that does not contain its own quote
    # character.
    "QUOTED_STRING": re.compile(
        r"""
\s*
(
('[^']*')
|
("[^"]*")
)
""",
        re.VERBOSE,
    ),
    # Comparison operators; "===" and "==" come before the shorter
    # alternatives they share a prefix with.
    "OP": r"\s*(===|==|~=|!=|<=|>=|<|>)",
    # Keywords used in marker expressions.
    "BOOLOP": r"\s*(or|and)",
    "IN": r"\s*in",
    "NOT": r"\s*not",
    # Names of the recognized marker environment variables; some accept either
    # "." or "_" as the separator.
    "VARIABLE": re.compile(
        r"""
\s*
(
python_version
|python_full_version
|os[._]name
|sys[._]platform
|platform_(release|system)
|platform[._](version|machine|python_implementation)
|python_implementation
|implementation_(name|version)
|extra
)
""",
        re.VERBOSE,
    ),
    # Version text; the pattern is reused from Specifier.
    "VERSION": re.compile(Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE),
    # "@" followed by optional spaces and a run of non-space characters.
    "URL_SPEC": r"\s*@ *[^ ]+",
    # Catch-all for names: letters, digits, ".", "_" and "-".
    "IDENTIFIER": r"\s*[a-zA-Z0-9._-]+",
}


class Tokenizer:
    """Stream of tokens for a LL(1) parser.

    Tokens are produced lazily, one at a time. ``peek``, ``match`` and
    ``expect`` look at the upcoming token without consuming it; ``read`` and
    ``try_read`` consume it. All failures are reported by raising
    ``ParseExceptionError``.
    """

    next_token: Optional[Token]

    def __init__(self, source: str, rules: Dict[str, object] = DEFAULT_RULES) -> None:
        compiled = {}
        for rule_name, pattern in rules.items():
            compiled[rule_name] = re.compile(pattern)

        self.source = source
        self.rules = compiled
        # The single token of lookahead; None means "not fetched yet".
        self.next_token = None
        self.position = 0
        # Creating the generator does not run any tokenizing code yet.
        self.generator = self._tokenize()

    def peek(self) -> Token:
        """
        Return the upcoming token without consuming it.

        The token is fetched from the generator on first use and cached until
        it is read. Once the END token has been consumed the generator is
        exhausted, so a further call lets StopIteration propagate.
        """
        if self.next_token is None:
            self.next_token = next(self.generator)
        return self.next_token

    def match(self, *name: str) -> bool:
        """
        Return True if the upcoming token matches the given arguments.
        """
        return self.peek().matches(*name)

    def expect(self, *name: str, error_message: str) -> Token:
        """
        Return the upcoming token without consuming it.

        Raise ParseExceptionError with ``error_message`` if it doesn't match
        the given arguments.
        """
        upcoming = self.peek()
        if upcoming.matches(*name):
            return upcoming
        self.raise_syntax_error(message=error_message)

    def read(self, *name: str, error_message: str = "") -> Token:
        """Return the upcoming token and advance past it.

        Raise ParseExceptionError if the token doesn't match.
        """
        consumed = self.expect(*name, error_message=error_message)
        # Dropping the cached lookahead makes the next peek() fetch a new token.
        self.next_token = None
        return consumed

    def try_read(self, *name: str) -> Optional[Token]:
        """read() if the upcoming token matches the given arguments.

        Return None and consume nothing if it does not match.
        """
        return self.read() if self.match(*name) else None

    def raise_syntax_error(self, *, message: str) -> NoReturn:
        """
        Raise ParseExceptionError for the tokenizer's current position.

        The error text contains the message, the position, the source string
        and a caret line pointing at that position.
        """
        where = self.position
        caret_line = "^".rjust(where + 1)
        raise ParseExceptionError(
            f"{message}\nat position {where}:\n {self.source}\n {caret_line}",
            where,
        )

    def _make_token(self, name: str, text: str) -> Token:
        """
        Build a token stamped with the current position.
        """
        return Token(name, text, self.position)

    def _tokenize(self) -> Generator[Token, Token, None]:
        """
        Generate the tokens of ``self.source``, ending with an END token.

        At each position the rules are tried in order and the first match
        wins. Raises ParseExceptionError if no rule matches.
        """
        while self.position < len(self.source):
            for rule_name, pattern in self.rules.items():
                found = pattern.match(self.source, self.position)
                if not found:
                    continue
                matched_text = found.group(0)
                # The position is advanced only after the consumer resumes the
                # generator, so while this token is the lookahead,
                # self.position still points at the start of its match.
                yield self._make_token(rule_name, matched_text.strip())
                self.position += len(matched_text)
                break
            else:
                self.raise_syntax_error(message="Unrecognized token")
        yield self._make_token("END", "")
Loading