diff --git a/news/10291.feature.rst b/news/10291.feature.rst new file mode 100644 index 00000000000..7291b629124 --- /dev/null +++ b/news/10291.feature.rst @@ -0,0 +1,5 @@ +Changed ``PackageFinder`` to parse HTML documents using the stdlib +:class:`html.parser.HTMLParser` class instead of the ``html5lib`` package. For +now, the deprecated ``html5lib`` code remains and can be used with the +``--use-deprecated=html5lib`` command line option, but it will be removed in a +future pip release. diff --git a/src/pip/_internal/cli/cmdoptions.py b/src/pip/_internal/cli/cmdoptions.py index e9806fd79d0..71b1d190691 100644 --- a/src/pip/_internal/cli/cmdoptions.py +++ b/src/pip/_internal/cli/cmdoptions.py @@ -964,7 +964,12 @@ def check_list_path_option(options: Values) -> None: metavar="feature", action="append", default=[], - choices=["legacy-resolver", "out-of-tree-build", "backtrack-on-build-failures"], + choices=[ + "legacy-resolver", + "out-of-tree-build", + "backtrack-on-build-failures", + "html5lib", + ], help=("Enable deprecated functionality, that will be removed in the future."), ) diff --git a/src/pip/_internal/cli/req_command.py b/src/pip/_internal/cli/req_command.py index 8dc00e32826..5d4d1f0f45b 100644 --- a/src/pip/_internal/cli/req_command.py +++ b/src/pip/_internal/cli/req_command.py @@ -502,4 +502,5 @@ def _build_package_finder( link_collector=link_collector, selection_prefs=selection_prefs, target_python=target_python, + use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled, ) diff --git a/src/pip/_internal/commands/index.py b/src/pip/_internal/commands/index.py index b4bf0ac06e1..9d8aae3b542 100644 --- a/src/pip/_internal/commands/index.py +++ b/src/pip/_internal/commands/index.py @@ -97,6 +97,7 @@ def _build_package_finder( link_collector=link_collector, selection_prefs=selection_prefs, target_python=target_python, + use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled, ) def get_available_package_versions(self, options: Values, args: List[Any]) -> None: diff --git a/src/pip/_internal/commands/list.py b/src/pip/_internal/commands/list.py index adac51058ce..57f05e00829 100644 --- a/src/pip/_internal/commands/list.py +++ b/src/pip/_internal/commands/list.py @@ -149,6 +149,7 @@ def _build_package_finder( return PackageFinder.create( link_collector=link_collector, selection_prefs=selection_prefs, + use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled, ) def run(self, options: Values, args: List[str]) -> int: diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index d9412234eed..c30c37661f5 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -12,15 +12,20 @@ import urllib.parse import urllib.request import xml.etree.ElementTree +from html.parser import HTMLParser from optparse import Values from typing import ( + TYPE_CHECKING, + Any, Callable, + Dict, Iterable, List, MutableMapping, NamedTuple, Optional, Sequence, + Tuple, Union, ) @@ -39,6 +44,11 @@ from .sources import CandidatesFromPage, LinkSource, build_source +if TYPE_CHECKING: + from typing import Protocol +else: + Protocol = object + logger = logging.getLogger(__name__) HTMLElement = xml.etree.ElementTree.Element @@ -163,6 +173,8 @@ def _determine_base_url(document: HTMLElement, page_url: str) -> str: :param document: An HTML document representation. The current implementation expects the result of ``html5lib.parse()``. :param page_url: The URL of the HTML document. + + TODO: Remove when `html5lib` is dropped. """ for base in document.findall(".//base"): href = base.get("href") @@ -234,20 +246,20 @@ def _clean_link(url: str) -> str: def _create_link_from_element( - anchor: HTMLElement, + element_attribs: Dict[str, Optional[str]], page_url: str, base_url: str, ) -> Optional[Link]: """ - Convert an anchor element in a simple repository page to a Link. + Convert an anchor element's attributes in a simple repository page to a Link. """ - href = anchor.get("href") + href = element_attribs.get("href") if not href: return None url = _clean_link(urllib.parse.urljoin(base_url, href)) - pyrequire = anchor.get("data-requires-python") - yanked_reason = anchor.get("data-yanked") + pyrequire = element_attribs.get("data-requires-python") + yanked_reason = element_attribs.get("data-yanked") link = Link( url, @@ -271,9 +283,14 @@ def __hash__(self) -> int: return hash(self.page.url) -def with_cached_html_pages( - fn: Callable[["HTMLPage"], Iterable[Link]], -) -> Callable[["HTMLPage"], List[Link]]: +class ParseLinks(Protocol): + def __call__( + self, page: "HTMLPage", use_deprecated_html5lib: bool + ) -> Iterable[Link]: + ... + + +def with_cached_html_pages(fn: ParseLinks) -> ParseLinks: """ Given a function that parses an Iterable[Link] from an HTMLPage, cache the function's result (keyed by CacheablePageContent), unless the HTMLPage @@ -281,22 +298,25 @@ def with_cached_html_pages( """ @functools.lru_cache(maxsize=None) - def wrapper(cacheable_page: CacheablePageContent) -> List[Link]: - return list(fn(cacheable_page.page)) + def wrapper( + cacheable_page: CacheablePageContent, use_deprecated_html5lib: bool + ) -> List[Link]: + return list(fn(cacheable_page.page, use_deprecated_html5lib)) @functools.wraps(fn) - def wrapper_wrapper(page: "HTMLPage") -> List[Link]: + def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]: if page.cache_link_parsing: - return wrapper(CacheablePageContent(page)) - return list(fn(page)) + return wrapper(CacheablePageContent(page), use_deprecated_html5lib) + return list(fn(page, use_deprecated_html5lib)) return wrapper_wrapper -@with_cached_html_pages -def parse_links(page: "HTMLPage") -> Iterable[Link]: +def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]: """ Parse an HTML document, and yield its anchor elements as Link objects. + + TODO: Remove when `html5lib` is dropped. """ document = html5lib.parse( page.content, @@ -307,6 +327,31 @@ def parse_links(page: "HTMLPage") -> Iterable[Link]: url = page.url base_url = _determine_base_url(document, url) for anchor in document.findall(".//a"): + link = _create_link_from_element( + anchor.attrib, + page_url=url, + base_url=base_url, + ) + if link is None: + continue + yield link + + +@with_cached_html_pages +def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]: + """ + Parse an HTML document, and yield its anchor elements as Link objects. + """ + if use_deprecated_html5lib: + return _parse_links_html5lib(page) + + parser = HTMLLinkParser() + encoding = page.encoding or "utf-8" + parser.feed(page.content.decode(encoding)) + + url = page.url + base_url = parser.base_url or url + for anchor in parser.anchors: link = _create_link_from_element( anchor, page_url=url, @@ -343,6 +388,49 @@ def __str__(self) -> str: return redact_auth_from_url(self.url) +class HTMLLinkParser(HTMLParser): + """ + HTMLParser that keeps the first base HREF and a list of all anchor + elements' attributes. + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self._seen_decl = False + self.base_url: Optional[str] = None + self.anchors: List[Dict[str, Optional[str]]] = [] + + def handle_decl(self, decl: str) -> None: + if decl != "DOCTYPE html": + self._raise_error() + self._seen_decl = True + + def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None: + if not self._seen_decl: + self._raise_error() + + if tag == "base" and self.base_url is None: + href = self.get_href(attrs) + if href is not None: + self.base_url = href + elif tag == "a": + self.anchors.append(dict(attrs)) + + def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]: + for name, value in attrs: + if name == "href": + return value + return None + + def _raise_error(self) -> None: + raise ValueError( + "HTML doctype missing or incorrect. Expected .\n\n" + "If you believe this error to be incorrect, try passing the " + "command line option --use-deprecated=html5lib and please leave " + "a comment on the pip issue at https://github.com/pypa/pip/issues/10825." + ) + + def _handle_get_page_fail( link: Link, reason: Union[str, Exception], diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index 86b7b65dd8a..223d06df67e 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -580,6 +580,7 @@ def __init__( link_collector: LinkCollector, target_python: TargetPython, allow_yanked: bool, + use_deprecated_html5lib: bool, format_control: Optional[FormatControl] = None, candidate_prefs: Optional[CandidatePreferences] = None, ignore_requires_python: Optional[bool] = None, @@ -604,6 +605,7 @@ def __init__( self._ignore_requires_python = ignore_requires_python self._link_collector = link_collector self._target_python = target_python + self._use_deprecated_html5lib = use_deprecated_html5lib self.format_control = format_control @@ -620,6 +622,8 @@ def create( link_collector: LinkCollector, selection_prefs: SelectionPreferences, target_python: Optional[TargetPython] = None, + *, + use_deprecated_html5lib: bool, ) -> "PackageFinder": """Create a PackageFinder. @@ -644,6 +648,7 @@ def create( allow_yanked=selection_prefs.allow_yanked, format_control=selection_prefs.format_control, ignore_requires_python=selection_prefs.ignore_requires_python, + use_deprecated_html5lib=use_deprecated_html5lib, ) @property @@ -765,7 +770,7 @@ def process_project_url( if html_page is None: return [] - page_links = list(parse_links(html_page)) + page_links = list(parse_links(html_page, self._use_deprecated_html5lib)) with indent_log(): package_links = self.evaluate_links( diff --git a/src/pip/_internal/self_outdated_check.py b/src/pip/_internal/self_outdated_check.py index 19a5f280f60..7300e0ea4c0 100644 --- a/src/pip/_internal/self_outdated_check.py +++ b/src/pip/_internal/self_outdated_check.py @@ -141,6 +141,9 @@ def pip_self_version_check(session: PipSession, options: optparse.Values) -> Non finder = PackageFinder.create( link_collector=link_collector, selection_prefs=selection_prefs, + use_deprecated_html5lib=( + "html5lib" in options.deprecated_features_enabled + ), ) best_candidate = finder.find_best_candidate("pip").best_candidate if best_candidate is None: diff --git a/tests/data/indexes/datarequire/fakepackage/index.html b/tests/data/indexes/datarequire/fakepackage/index.html index 0ca8b9dc3a2..25bf4aa21d5 100644 --- a/tests/data/indexes/datarequire/fakepackage/index.html +++ b/tests/data/indexes/datarequire/fakepackage/index.html @@ -1,3 +1,4 @@ + Links for fakepackage

Links for fakepackage

fakepackage-1.0.0.tar.gz
fakepackage-2.6.0.tar.gz
diff --git a/tests/data/indexes/dev/bar/index.html b/tests/data/indexes/dev/bar/index.html index bcee309212c..c0da6561310 100644 --- a/tests/data/indexes/dev/bar/index.html +++ b/tests/data/indexes/dev/bar/index.html @@ -1,3 +1,4 @@ + bar-1.0.tar.gz diff --git a/tests/data/indexes/in dex/simple/index.html b/tests/data/indexes/in dex/simple/index.html index dba6cc3ebd6..cb078ea7b19 100644 --- a/tests/data/indexes/in dex/simple/index.html +++ b/tests/data/indexes/in dex/simple/index.html @@ -1,3 +1,4 @@ + simple-1.0.tar.gz diff --git a/tests/data/indexes/pre/bar/index.html b/tests/data/indexes/pre/bar/index.html index c50d88bc863..da76454f604 100644 --- a/tests/data/indexes/pre/bar/index.html +++ b/tests/data/indexes/pre/bar/index.html @@ -1,3 +1,4 @@ + bar-1.0.tar.gz diff --git a/tests/data/indexes/simple/simple/index.html b/tests/data/indexes/simple/simple/index.html index dba6cc3ebd6..cb078ea7b19 100644 --- a/tests/data/indexes/simple/simple/index.html +++ b/tests/data/indexes/simple/simple/index.html @@ -1,3 +1,4 @@ + simple-1.0.tar.gz diff --git a/tests/data/indexes/yanked/simple/index.html b/tests/data/indexes/yanked/simple/index.html index bf4994310be..14181a3a0ad 100644 --- a/tests/data/indexes/yanked/simple/index.html +++ b/tests/data/indexes/yanked/simple/index.html @@ -1,3 +1,4 @@ + simple-1.0.tar.gz diff --git a/tests/data/indexes/yanked_all/simple/index.html b/tests/data/indexes/yanked_all/simple/index.html index 732340baa45..060f9904465 100644 --- a/tests/data/indexes/yanked_all/simple/index.html +++ b/tests/data/indexes/yanked_all/simple/index.html @@ -1,3 +1,4 @@ + simple-1.0.tar.gz diff --git a/tests/data/packages3/dinner/index.html b/tests/data/packages3/dinner/index.html index e258eb16b40..52a16b11686 100644 --- a/tests/data/packages3/dinner/index.html +++ b/tests/data/packages3/dinner/index.html @@ -1,3 +1,4 @@ + PyPI Mirror

PyPI Mirror

diff --git a/tests/data/packages3/index.html b/tests/data/packages3/index.html index d66e70ec631..262207b6a62 100644 --- a/tests/data/packages3/index.html +++ b/tests/data/packages3/index.html @@ -1,3 +1,4 @@ + PyPI Mirror

PyPI Mirror

diff --git a/tests/data/packages3/requiredinner/index.html b/tests/data/packages3/requiredinner/index.html index 0981c9c7246..52a4e66673c 100644 --- a/tests/data/packages3/requiredinner/index.html +++ b/tests/data/packages3/requiredinner/index.html @@ -1,3 +1,4 @@ + PyPI Mirror

PyPI Mirror

diff --git a/tests/functional/test_build_env.py b/tests/functional/test_build_env.py index d114e8d2359..285f21fda89 100644 --- a/tests/functional/test_build_env.py +++ b/tests/functional/test_build_env.py @@ -48,6 +48,7 @@ def run_with_build_env( finder = PackageFinder.create( link_collector=link_collector, selection_prefs=selection_prefs, + use_deprecated_html5lib=False, ) with global_tempdir_manager(): diff --git a/tests/functional/test_new_resolver_hashes.py b/tests/functional/test_new_resolver_hashes.py index 39c1d012c65..4c4c2253e99 100644 --- a/tests/functional/test_new_resolver_hashes.py +++ b/tests/functional/test_new_resolver_hashes.py @@ -26,6 +26,7 @@ def _create_find_links(script: PipTestEnvironment) -> _FindLinks: index_html = script.scratch_path / "index.html" index_html.write_text( """ + {sdist_path.stem} {wheel_path.stem} """.format( diff --git a/tests/lib/__init__.py b/tests/lib/__init__.py index 06849d2d705..cb6364ebbb7 100644 --- a/tests/lib/__init__.py +++ b/tests/lib/__init__.py @@ -141,6 +141,7 @@ def make_test_finder( allow_all_prereleases: bool = False, session: Optional[PipSession] = None, target_python: Optional[TargetPython] = None, + use_deprecated_html5lib: bool = False, ) -> PackageFinder: """ Create a PackageFinder for testing purposes. @@ -159,6 +160,7 @@ def make_test_finder( link_collector=link_collector, selection_prefs=selection_prefs, target_python=target_python, + use_deprecated_html5lib=use_deprecated_html5lib, ) diff --git a/tests/unit/resolution_resolvelib/conftest.py b/tests/unit/resolution_resolvelib/conftest.py index cfd440570e6..545ed7c3506 100644 --- a/tests/unit/resolution_resolvelib/conftest.py +++ b/tests/unit/resolution_resolvelib/conftest.py @@ -26,7 +26,7 @@ def finder(data: TestData) -> Iterator[PackageFinder]: scope = SearchScope([str(data.packages)], []) collector = LinkCollector(session, scope) prefs = SelectionPreferences(allow_yanked=False) - finder = PackageFinder.create(collector, prefs) + finder = PackageFinder.create(collector, prefs, use_deprecated_html5lib=False) yield finder diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 8b60c302915..2225a32bf45 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -421,7 +421,11 @@ def test_clean_link(url: str, clean_url: str) -> None: def _test_parse_links_data_attribute( anchor_html: str, attr: str, expected: Optional[str] ) -> None: - html = f'{anchor_html}' + html = ( + "" + '' + "{}" + ).format(anchor_html) html_bytes = html.encode("utf-8") page = HTMLPage( html_bytes, @@ -430,7 +434,7 @@ def _test_parse_links_data_attribute( # the page content isn't cached. url=f"https://example.com/simple-{uuid.uuid4()}/", ) - links = list(parse_links(page)) + links = list(parse_links(page, use_deprecated_html5lib=False)) (link,) = links actual = getattr(link, attr) assert actual == expected @@ -467,7 +471,7 @@ def test_parse_links__requires_python( # Test not present. ('', None), # Test present with no value. - ('', ""), + ('', None), # Test the empty string. ('', ""), # Test a non-empty string. @@ -492,6 +496,7 @@ def test_parse_links__yanked_reason(anchor_html: str, expected: Optional[str]) - def test_parse_links_caches_same_page_by_url() -> None: html = ( + "" '' '' ) @@ -521,14 +526,14 @@ def test_parse_links_caches_same_page_by_url() -> None: cache_link_parsing=False, ) - parsed_links_1 = list(parse_links(page_1)) + parsed_links_1 = list(parse_links(page_1, use_deprecated_html5lib=False)) assert len(parsed_links_1) == 1 assert "pkg1" in parsed_links_1[0].url - parsed_links_2 = list(parse_links(page_2)) + parsed_links_2 = list(parse_links(page_2, use_deprecated_html5lib=False)) assert parsed_links_2 == parsed_links_1 - parsed_links_3 = list(parse_links(page_3)) + parsed_links_3 = list(parse_links(page_3, use_deprecated_html5lib=False)) assert len(parsed_links_3) == 1 assert parsed_links_3 != parsed_links_1 assert "pkg2" in parsed_links_3[0].url diff --git a/tests/unit/test_finder.py b/tests/unit/test_finder.py index f8646da4f5f..deff295828d 100644 --- a/tests/unit/test_finder.py +++ b/tests/unit/test_finder.py @@ -79,7 +79,10 @@ def test_incorrect_case_file_index(data: TestData) -> None: @pytest.mark.network -def test_finder_detects_latest_already_satisfied_find_links(data: TestData) -> None: +@pytest.mark.parametrize("use_deprecated_html5lib", [False, True]) +def test_finder_detects_latest_already_satisfied_find_links( + data: TestData, use_deprecated_html5lib: bool +) -> None: """Test PackageFinder detects latest already satisfied using find-links""" req = install_req_from_line("simple", None) # the latest simple in local pkgs is 3.0 @@ -89,14 +92,19 @@ def test_finder_detects_latest_already_satisfied_find_links(data: TestData) -> N version=parse_version(latest_version), ) req.satisfied_by = satisfied_by - finder = make_test_finder(find_links=[data.find_links]) + finder = make_test_finder( + find_links=[data.find_links], use_deprecated_html5lib=use_deprecated_html5lib + ) with pytest.raises(BestVersionAlreadyInstalled): finder.find_requirement(req, True) @pytest.mark.network -def test_finder_detects_latest_already_satisfied_pypi_links() -> None: +@pytest.mark.parametrize("use_deprecated_html5lib", [False, True]) +def test_finder_detects_latest_already_satisfied_pypi_links( + use_deprecated_html5lib: bool, +) -> None: """Test PackageFinder detects latest already satisfied using pypi links""" req = install_req_from_line("initools", None) # the latest initools on PyPI is 0.3.1 @@ -106,7 +114,10 @@ def test_finder_detects_latest_already_satisfied_pypi_links() -> None: version=parse_version(latest_version), ) req.satisfied_by = satisfied_by - finder = make_test_finder(index_urls=["http://pypi.org/simple/"]) + finder = make_test_finder( + index_urls=["http://pypi.org/simple/"], + use_deprecated_html5lib=use_deprecated_html5lib, + ) with pytest.raises(BestVersionAlreadyInstalled): finder.find_requirement(req, True) diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py index 39106f63b23..fa98f28c89c 100644 --- a/tests/unit/test_index.py +++ b/tests/unit/test_index.py @@ -575,6 +575,7 @@ def test_create__candidate_prefs( finder = PackageFinder.create( link_collector=link_collector, selection_prefs=selection_prefs, + use_deprecated_html5lib=False, ) candidate_prefs = finder._candidate_prefs assert candidate_prefs.allow_all_prereleases == allow_all_prereleases @@ -591,6 +592,7 @@ def test_create__link_collector(self) -> None: finder = PackageFinder.create( link_collector=link_collector, selection_prefs=SelectionPreferences(allow_yanked=True), + use_deprecated_html5lib=False, ) assert finder._link_collector is link_collector @@ -608,6 +610,7 @@ def test_create__target_python(self) -> None: link_collector=link_collector, selection_prefs=SelectionPreferences(allow_yanked=True), target_python=target_python, + use_deprecated_html5lib=False, ) actual_target_python = finder._target_python # The target_python attribute should be set as is. @@ -627,6 +630,7 @@ def test_create__target_python_none(self) -> None: link_collector=link_collector, selection_prefs=SelectionPreferences(allow_yanked=True), target_python=None, + use_deprecated_html5lib=False, ) # Spot-check the default TargetPython object. actual_target_python = finder._target_python @@ -646,6 +650,7 @@ def test_create__allow_yanked(self, allow_yanked: bool) -> None: finder = PackageFinder.create( link_collector=link_collector, selection_prefs=selection_prefs, + use_deprecated_html5lib=False, ) assert finder._allow_yanked == allow_yanked @@ -665,6 +670,7 @@ def test_create__ignore_requires_python(self, ignore_requires_python: bool) -> N finder = PackageFinder.create( link_collector=link_collector, selection_prefs=selection_prefs, + use_deprecated_html5lib=False, ) assert finder._ignore_requires_python == ignore_requires_python @@ -684,6 +690,7 @@ def test_create__format_control(self) -> None: finder = PackageFinder.create( link_collector=link_collector, selection_prefs=selection_prefs, + use_deprecated_html5lib=False, ) actual_format_control = finder.format_control assert actual_format_control is format_control @@ -724,6 +731,7 @@ def test_make_link_evaluator( allow_yanked=allow_yanked, format_control=format_control, ignore_requires_python=ignore_requires_python, + use_deprecated_html5lib=False, ) # Pass a project_name that will be different from canonical_name. @@ -772,6 +780,7 @@ def test_make_candidate_evaluator( target_python=target_python, allow_yanked=True, candidate_prefs=candidate_prefs, + use_deprecated_html5lib=False, ) specifier = SpecifierSet() diff --git a/tests/unit/test_self_check_outdated.py b/tests/unit/test_self_check_outdated.py index 22214fbcfbd..d313f3fd019 100644 --- a/tests/unit/test_self_check_outdated.py +++ b/tests/unit/test_self_check_outdated.py @@ -84,6 +84,7 @@ def _options() -> mock.Mock: no_index=False, pre=False, cache_dir="", + deprecated_features_enabled=[], )