Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace vendored html5lib with stdlib #10291

Merged
merged 2 commits into from
Jan 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions news/10291.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Changed ``PackageFinder`` to parse HTML documents using the stdlib
:class:`html.parser.HTMLParser` class instead of the ``html5lib`` package. For
now, the deprecated ``html5lib`` code remains and can be used with the
``--use-deprecated=html5lib`` command line option, but it will be removed in a
future pip release.
7 changes: 6 additions & 1 deletion src/pip/_internal/cli/cmdoptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -964,7 +964,12 @@ def check_list_path_option(options: Values) -> None:
metavar="feature",
action="append",
default=[],
choices=["legacy-resolver", "out-of-tree-build", "backtrack-on-build-failures"],
choices=[
"legacy-resolver",
"out-of-tree-build",
"backtrack-on-build-failures",
"html5lib",
],
help=("Enable deprecated functionality, that will be removed in the future."),
)

Expand Down
1 change: 1 addition & 0 deletions src/pip/_internal/cli/req_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,4 +502,5 @@ def _build_package_finder(
link_collector=link_collector,
selection_prefs=selection_prefs,
target_python=target_python,
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
)
1 change: 1 addition & 0 deletions src/pip/_internal/commands/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def _build_package_finder(
link_collector=link_collector,
selection_prefs=selection_prefs,
target_python=target_python,
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
)

def get_available_package_versions(self, options: Values, args: List[Any]) -> None:
Expand Down
1 change: 1 addition & 0 deletions src/pip/_internal/commands/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def _build_package_finder(
return PackageFinder.create(
link_collector=link_collector,
selection_prefs=selection_prefs,
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
)

def run(self, options: Values, args: List[str]) -> int:
Expand Down
118 changes: 103 additions & 15 deletions src/pip/_internal/index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,20 @@
import urllib.parse
import urllib.request
import xml.etree.ElementTree
from html.parser import HTMLParser
from optparse import Values
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
MutableMapping,
NamedTuple,
Optional,
Sequence,
Tuple,
Union,
)

Expand All @@ -39,6 +44,11 @@

from .sources import CandidatesFromPage, LinkSource, build_source

if TYPE_CHECKING:
from typing import Protocol
else:
Protocol = object

logger = logging.getLogger(__name__)

HTMLElement = xml.etree.ElementTree.Element
Expand Down Expand Up @@ -163,6 +173,8 @@ def _determine_base_url(document: HTMLElement, page_url: str) -> str:
:param document: An HTML document representation. The current
implementation expects the result of ``html5lib.parse()``.
:param page_url: The URL of the HTML document.

TODO: Remove when `html5lib` is dropped.
"""
for base in document.findall(".//base"):
href = base.get("href")
Expand Down Expand Up @@ -234,20 +246,20 @@ def _clean_link(url: str) -> str:


def _create_link_from_element(
anchor: HTMLElement,
element_attribs: Dict[str, Optional[str]],
page_url: str,
base_url: str,
) -> Optional[Link]:
"""
Convert an anchor element in a simple repository page to a Link.
Convert an anchor element's attributes in a simple repository page to a Link.
"""
href = anchor.get("href")
href = element_attribs.get("href")
if not href:
return None

url = _clean_link(urllib.parse.urljoin(base_url, href))
pyrequire = anchor.get("data-requires-python")
yanked_reason = anchor.get("data-yanked")
pyrequire = element_attribs.get("data-requires-python")
yanked_reason = element_attribs.get("data-yanked")

link = Link(
url,
Expand All @@ -271,32 +283,40 @@ def __hash__(self) -> int:
return hash(self.page.url)


def with_cached_html_pages(
fn: Callable[["HTMLPage"], Iterable[Link]],
) -> Callable[["HTMLPage"], List[Link]]:
class ParseLinks(Protocol):
    """
    Call signature of a link-parsing function: it takes an ``HTMLPage`` and
    the ``use_deprecated_html5lib`` flag, and returns an iterable of ``Link``.
    """

    def __call__(
        self, page: "HTMLPage", use_deprecated_html5lib: bool
    ) -> Iterable[Link]:
        ...


def with_cached_html_pages(fn: ParseLinks) -> ParseLinks:
"""
Given a function that parses an Iterable[Link] from an HTMLPage, cache the
function's result (keyed by CacheablePageContent), unless the HTMLPage
`page` has `page.cache_link_parsing == False`.
"""

@functools.lru_cache(maxsize=None)
def wrapper(cacheable_page: CacheablePageContent) -> List[Link]:
return list(fn(cacheable_page.page))
def wrapper(
cacheable_page: CacheablePageContent, use_deprecated_html5lib: bool
) -> List[Link]:
return list(fn(cacheable_page.page, use_deprecated_html5lib))

@functools.wraps(fn)
def wrapper_wrapper(page: "HTMLPage") -> List[Link]:
def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]:
if page.cache_link_parsing:
return wrapper(CacheablePageContent(page))
return list(fn(page))
return wrapper(CacheablePageContent(page), use_deprecated_html5lib)
return list(fn(page, use_deprecated_html5lib))

return wrapper_wrapper


@with_cached_html_pages
def parse_links(page: "HTMLPage") -> Iterable[Link]:
def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
"""
Parse an HTML document, and yield its anchor elements as Link objects.

TODO: Remove when `html5lib` is dropped.
"""
document = html5lib.parse(
page.content,
Expand All @@ -307,6 +327,31 @@ def parse_links(page: "HTMLPage") -> Iterable[Link]:
url = page.url
base_url = _determine_base_url(document, url)
for anchor in document.findall(".//a"):
link = _create_link_from_element(
anchor.attrib,
page_url=url,
base_url=base_url,
)
if link is None:
continue
yield link


@with_cached_html_pages
def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
"""
Parse an HTML document, and yield its anchor elements as Link objects.
"""
if use_deprecated_html5lib:
return _parse_links_html5lib(page)

parser = HTMLLinkParser()
encoding = page.encoding or "utf-8"
parser.feed(page.content.decode(encoding))

url = page.url
base_url = parser.base_url or url
for anchor in parser.anchors:
link = _create_link_from_element(
anchor,
page_url=url,
Expand Down Expand Up @@ -343,6 +388,49 @@ def __str__(self) -> str:
return redact_auth_from_url(self.url)


class HTMLLinkParser(HTMLParser):
    """
    HTMLParser that keeps the first base HREF and a list of all anchor
    elements' attributes.

    An HTML5 doctype declaration must be seen before the first start tag;
    otherwise :class:`ValueError` is raised while the document is being fed.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Becomes True once an accepted doctype declaration has been handled.
        self._seen_decl = False
        # href of the first <base> start tag that carries one, if any.
        self.base_url: Optional[str] = None
        # One attribute mapping per <a> start tag, in document order.
        self.anchors: List[Dict[str, Optional[str]]] = []

    def handle_decl(self, decl: str) -> None:
        """
        Validate the document's doctype declaration.

        :param decl: The declaration text between ``<!`` and ``>``.
        :raises ValueError: If the declaration is not the HTML5 doctype.
        """
        # The doctype is ASCII case-insensitive in HTML and may contain extra
        # whitespace, so "<!doctype html>" and "<!DOCTYPE  html >" are just as
        # valid as "<!DOCTYPE html>". Normalize before comparing.
        normalized = " ".join(decl.split()).lower()
        if normalized != "doctype html":
            self._raise_error()
        self._seen_decl = True

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        """
        Record the first base href and the attributes of every anchor.

        :param tag: The lower-cased tag name.
        :param attrs: The tag's ``(name, value)`` attribute pairs.
        :raises ValueError: If no valid doctype has been seen yet.
        """
        if not self._seen_decl:
            self._raise_error()

        if tag == "base" and self.base_url is None:
            # Only the first <base> that actually has an href is kept.
            href = self.get_href(attrs)
            if href is not None:
                self.base_url = href
        elif tag == "a":
            self.anchors.append(dict(attrs))

    def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
        """
        Return the value of the first ``href`` attribute in ``attrs``, or
        ``None`` when there is no such attribute.
        """
        for name, value in attrs:
            if name == "href":
                return value
        return None

    def _raise_error(self) -> None:
        """Raise the ``ValueError`` used for a missing or incorrect doctype."""
        raise ValueError(
            "HTML doctype missing or incorrect. Expected <!DOCTYPE html>.\n\n"
            "If you believe this error to be incorrect, try passing the "
            "command line option --use-deprecated=html5lib and please leave "
            "a comment on the pip issue at https://github.com/pypa/pip/issues/10825."
        )


def _handle_get_page_fail(
link: Link,
reason: Union[str, Exception],
Expand Down
7 changes: 6 additions & 1 deletion src/pip/_internal/index/package_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,7 @@ def __init__(
link_collector: LinkCollector,
target_python: TargetPython,
allow_yanked: bool,
use_deprecated_html5lib: bool,
format_control: Optional[FormatControl] = None,
candidate_prefs: Optional[CandidatePreferences] = None,
ignore_requires_python: Optional[bool] = None,
Expand All @@ -604,6 +605,7 @@ def __init__(
self._ignore_requires_python = ignore_requires_python
self._link_collector = link_collector
self._target_python = target_python
self._use_deprecated_html5lib = use_deprecated_html5lib

self.format_control = format_control

Expand All @@ -620,6 +622,8 @@ def create(
link_collector: LinkCollector,
selection_prefs: SelectionPreferences,
target_python: Optional[TargetPython] = None,
*,
use_deprecated_html5lib: bool,
) -> "PackageFinder":
"""Create a PackageFinder.

Expand All @@ -644,6 +648,7 @@ def create(
allow_yanked=selection_prefs.allow_yanked,
format_control=selection_prefs.format_control,
ignore_requires_python=selection_prefs.ignore_requires_python,
use_deprecated_html5lib=use_deprecated_html5lib,
)

@property
Expand Down Expand Up @@ -765,7 +770,7 @@ def process_project_url(
if html_page is None:
return []

page_links = list(parse_links(html_page))
page_links = list(parse_links(html_page, self._use_deprecated_html5lib))

with indent_log():
package_links = self.evaluate_links(
Expand Down
3 changes: 3 additions & 0 deletions src/pip/_internal/self_outdated_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ def pip_self_version_check(session: PipSession, options: optparse.Values) -> Non
finder = PackageFinder.create(
link_collector=link_collector,
selection_prefs=selection_prefs,
use_deprecated_html5lib=(
"html5lib" in options.deprecated_features_enabled
),
)
best_candidate = finder.find_best_candidate("pip").best_candidate
if best_candidate is None:
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/datarequire/fakepackage/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html><head><title>Links for fakepackage</title><meta name="api-version" value="2" /></head><body><h1>Links for fakepackage</h1>
<a data-requires-python='' href="/fakepackage-1.0.0.tar.gz#md5=00000000000000000000000000000000" rel="internal">fakepackage-1.0.0.tar.gz</a><br/>
<a data-requires-python='&lt;2.7' href="/fakepackage-2.6.0.tar.gz#md5=00000000000000000000000000000000" rel="internal">fakepackage-2.6.0.tar.gz</a><br/>
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/dev/bar/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html>
<body>
<a href="bar-1.0.tar.gz">bar-1.0.tar.gz</a>
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/in dex/simple/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html>
<body>
<a href="../../../packages/simple-1.0.tar.gz#md5=4bdf78ebb7911f215c1972cf71b378f0">simple-1.0.tar.gz</a>
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/pre/bar/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html>
<body>
<a href="bar-1.0.tar.gz">bar-1.0.tar.gz</a>
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/simple/simple/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html>
<body>
<a href="../../../packages/simple-1.0.tar.gz#md5=4bdf78ebb7911f215c1972cf71b378f0">simple-1.0.tar.gz</a>
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/yanked/simple/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html>
<body>
<a href="../../../packages/simple-1.0.tar.gz">simple-1.0.tar.gz</a>
Expand Down
1 change: 1 addition & 0 deletions tests/data/indexes/yanked_all/simple/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html>
<body>
<a data-yanked="test reason message" href="../../../packages/simple-1.0.tar.gz">simple-1.0.tar.gz</a>
Expand Down
1 change: 1 addition & 0 deletions tests/data/packages3/dinner/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html><head><title>PyPI Mirror</title></head>
<body>
<h1>PyPI Mirror</h1>
Expand Down
1 change: 1 addition & 0 deletions tests/data/packages3/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html><head><title>PyPI Mirror</title></head>
<body>
<h1>PyPI Mirror</h1>
Expand Down
1 change: 1 addition & 0 deletions tests/data/packages3/requiredinner/index.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html><head><title>PyPI Mirror</title></head>
<body>
<h1>PyPI Mirror</h1>
Expand Down
1 change: 1 addition & 0 deletions tests/functional/test_build_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def run_with_build_env(
finder = PackageFinder.create(
link_collector=link_collector,
selection_prefs=selection_prefs,
use_deprecated_html5lib=False,
)

with global_tempdir_manager():
Expand Down
1 change: 1 addition & 0 deletions tests/functional/test_new_resolver_hashes.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def _create_find_links(script: PipTestEnvironment) -> _FindLinks:
index_html = script.scratch_path / "index.html"
index_html.write_text(
"""
<!DOCTYPE html>
<a href="{sdist_url}#sha256={sdist_hash}">{sdist_path.stem}</a>
<a href="{wheel_url}#sha256={wheel_hash}">{wheel_path.stem}</a>
""".format(
Expand Down
2 changes: 2 additions & 0 deletions tests/lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ def make_test_finder(
allow_all_prereleases: bool = False,
session: Optional[PipSession] = None,
target_python: Optional[TargetPython] = None,
use_deprecated_html5lib: bool = False,
) -> PackageFinder:
"""
Create a PackageFinder for testing purposes.
Expand All @@ -159,6 +160,7 @@ def make_test_finder(
link_collector=link_collector,
selection_prefs=selection_prefs,
target_python=target_python,
use_deprecated_html5lib=use_deprecated_html5lib,
)


Expand Down
2 changes: 1 addition & 1 deletion tests/unit/resolution_resolvelib/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def finder(data: TestData) -> Iterator[PackageFinder]:
scope = SearchScope([str(data.packages)], [])
collector = LinkCollector(session, scope)
prefs = SelectionPreferences(allow_yanked=False)
finder = PackageFinder.create(collector, prefs)
finder = PackageFinder.create(collector, prefs, use_deprecated_html5lib=False)
yield finder


Expand Down
Loading