From 7781e31880f7994e9b75905728a4d7cb40e73732 Mon Sep 17 00:00:00 2001 From: Wolf Vollprecht Date: Fri, 3 Feb 2023 20:09:25 +0100 Subject: [PATCH 1/6] honor pagination when getting tags and adjust API to match types add raise_for_status and test getting many tags from ghcr.io Signed-off-by: Wolf Vollprecht --- examples/conda-mirror.py | 1 - oras/client.py | 2 +- oras/container.py | 4 +++- oras/provider.py | 33 +++++++++++++++++++++++++++++---- oras/tests/test_oras.py | 15 ++++++++++++--- oras/tests/test_utils.py | 1 - setup.py | 1 - 7 files changed, 45 insertions(+), 12 deletions(-) diff --git a/examples/conda-mirror.py b/examples/conda-mirror.py index 1b5d795..a74be55 100644 --- a/examples/conda-mirror.py +++ b/examples/conda-mirror.py @@ -33,7 +33,6 @@ class CondaMirror(oras.provider.Registry): } def inspect(self, name): - # Parse the name into a container container = self.get_container(name) diff --git a/oras/client.py b/oras/client.py index e3a6628..a3ac500 100644 --- a/oras/client.py +++ b/oras/client.py @@ -103,7 +103,7 @@ def get_tags(self, name: str, N: int = 10_000) -> List[str]: :param N: number of tags :type N: int """ - return self.remote.get_tags(name, N=N).json() + return self.remote.get_tags(name, N=N) def push(self, *args, **kwargs): """ diff --git a/oras/container.py b/oras/container.py index 4761344..884d7fd 100644 --- a/oras/container.py +++ b/oras/container.py @@ -54,7 +54,9 @@ def get_blob_url(self, digest: str) -> str: def upload_blob_url(self) -> str: return f"{self.registry}/v2/{self.api_prefix}/blobs/uploads/" - def tags_url(self, N=10_000) -> str: + def tags_url(self, N=10_000, query=None) -> str: + if query: + return f"{self.registry}/v2/{self.api_prefix}/tags/list?{query}" return f"{self.registry}/v2/{self.api_prefix}/tags/list?n={N}" def put_manifest_url(self) -> str: diff --git a/oras/provider.py b/oras/provider.py index 111c031..20d5806 100644 --- a/oras/provider.py +++ b/oras/provider.py @@ -4,6 +4,7 @@ import copy import os +import urllib from typing import List, Optional, Tuple, Union import jsonschema @@ -236,8 +237,34 @@ def get_tags( :param N: number of tags :type N: int """ - tags_url = f"{self.prefix}://{container.tags_url(N)}" # type: ignore - return self.do_request(tags_url, "GET", headers=self.headers) + tags_url = f"{self.prefix}://{container.tags_url(N=N)}" # type: ignore + + tags: List[str] = [] + has_next_link = True + # get all tags using the pagination + while len(tags) < N and has_next_link: + res = self.do_request(tags_url, "GET", headers=self.headers) + + # raise before trying to get `json` value + res.raise_for_status() + + if res.headers.get("Link"): + link = res.headers.get("Link") + # if we have a next link, that looks something like: + # ; rel="next" + # we want to extract the url and get the rest of the tags + assert link.endswith('; rel="next"') + next_link = link[link.find("<") + 1 : link.find(">")] + query = urllib.parse.urlparse(next_link).query + tags_url = f"{self.prefix}://{container.tags_url(query=query)}" # type: ignore + else: + has_next_link = False + + # if the package does not exist, the response is an + # {"errors":[{"code":"NAME_UNKNOWN","message":"repository name not known to registry"}]} + tags += res.json().get("tags", []) + + return tags @ensure_container def get_blob( @@ -548,7 +575,6 @@ def push(self, *args, **kwargs) -> requests.Response: # Upload files as blobs for blob in kwargs.get("files", []): - # You can provide a blob + content type if ":" in str(blob): blob, media_type = str(blob).split(":", 1) @@ -809,7 +835,6 @@ def authenticate_request(self, originalResponse: requests.Response) -> bool: h = oras.auth.parse_auth_header(authHeaderRaw) if "Authorization" not in headers: - # First try to request an anonymous token logger.debug("No Authorization, requesting anonymous token") if self.request_anonymous_token(h): diff --git a/oras/tests/test_oras.py b/oras/tests/test_oras.py index dfea823..e64e956 100644 --- a/oras/tests/test_oras.py +++ b/oras/tests/test_oras.py @@ -72,9 +72,7 @@ def test_basic_push_pull(tmp_path): # Test getting tags tags = client.get_tags(target) - for key in ["name", "tags"]: - assert key in tags - assert "v1" in tags["tags"] + assert "v1" in tags # Test pulling elsewhere files = client.pull(target=target, outdir=tmp_path) @@ -94,6 +92,17 @@ def test_basic_push_pull(tmp_path): assert res.status_code == 201 +def test_get_many_tags(): + """ + Test getting many tags + """ + client = oras.client.OrasClient(hostname="ghcr.io", insecure=False) + tags = client.get_tags( + "channel-mirrors/conda-forge/linux-aarch64/arrow-cpp", N=100000 + ) + assert len(tags) > 1000 + + @pytest.mark.skipif(with_auth, reason="token auth is needed for push and pull") def test_directory_push_pull(tmp_path): """ diff --git a/oras/tests/test_utils.py b/oras/tests/test_utils.py index deb3e13..a08c3b5 100644 --- a/oras/tests/test_utils.py +++ b/oras/tests/test_utils.py @@ -45,7 +45,6 @@ def test_write_bad_json(tmp_path): def test_write_json(tmp_path): - good_json = {"Wakkawakkawakka": [True, "2", 3]} tmpfile = str(tmp_path / "good_json_file.txt") diff --git a/setup.py b/setup.py index 1203271..ad6234c 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,6 @@ def get_reqs(lookup=None, key="INSTALL_REQUIRES"): ################################################################################ if __name__ == "__main__": - INSTALL_REQUIRES = get_reqs(lookup) TESTS_REQUIRES = get_reqs(lookup, "TESTS_REQUIRES") INSTALL_REQUIRES_ALL = get_reqs(lookup, "INSTALL_REQUIRES_ALL") From a871bfec35fb5956dfc051be151214ea37841e68 Mon Sep 17 00:00:00 2001 From: vsoch Date: Fri, 3 Feb 2023 21:34:01 -0700 Subject: [PATCH 2/6] add general function to get tags this extends the updated get tags function (with pagination!) to use a general function, so a future caller can use the same functionality. Signed-off-by: vsoch --- CHANGELOG.md | 1 + docs/conf.py | 2 +- oras/defaults.py | 2 +- oras/provider.py | 80 +++++++++++++++++++++++++++++++++++------------- oras/version.py | 2 +- 5 files changed, 62 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 361ec28..afbb1cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are: The versions coincide with releases on pip. Only major versions will be released as tags on Github. ## [0.0.x](https://github.com/oras-project/oras-py/tree/main) (0.0.x) + - pagination for tags (and general function for pagination) (0.1.14) - expose upload_blob function to be consistent (0.1.13) - ensure we always strip path separators before pull/push (0.1.12) - exposing download_blob to the user since it uses streaming (0.1.11) diff --git a/docs/conf.py b/docs/conf.py index 1ca329c..bca4e54 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -28,7 +28,7 @@ project = "Oras Python" html_title = "Oras Python" -copyright = "2022, Oras Python Developers" +copyright = "2023, Oras Python Developers" author = "@vsoch" # The full version, including alpha/beta/rc tags diff --git a/oras/defaults.py b/oras/defaults.py index 28910f6..cccde2d 100644 --- a/oras/defaults.py +++ b/oras/defaults.py @@ -1,5 +1,5 @@ __author__ = "Vanessa Sochat" -__copyright__ = "Copyright 2021-2022, Vanessa Sochat" +__copyright__ = "Copyright The ORAS Authors" __license__ = "Apache-2.0" diff --git a/oras/provider.py b/oras/provider.py index 20d5806..0ec6b2d 100644 --- a/oras/provider.py +++ b/oras/provider.py @@ -238,33 +238,69 @@ def get_tags( :type N: int """ tags_url = f"{self.prefix}://{container.tags_url(N=N)}" # type: ignore + return self.do_paginated_request(tags_url, N=N, unwrap="tags") - tags: List[str] = [] + def _get_response_links(self, response: requests.Response) -> dict: + """ + Get a named or all response links. + + response.links does not seem reliable to always parse, so we + revert to using the raw requests library to parse the headers. + """ + # The links header to parse (empty if None) + link_header = response.headers.get("Link") + if not link_header: + return {} + links = requests.utils.parse_header_links( + link_header.rstrip(">").replace(">,<", ",<") + ) + return {x["rel"]: x["url"] for x in links} + + def do_paginated_request( + self, + url: str, + N: int = 10_000, + unwrap: Optional[str] = None, + headers: Optional[dict] = None, + ) -> List: + """ + Paginate a request for a URL. + + We look for the "Link" header to get the next URL to ping. If provided, + we "unwrap" the provided field in the response json. + """ + results: List[str] = [] has_next_link = True - # get all tags using the pagination - while len(tags) < N and has_next_link: - res = self.do_request(tags_url, "GET", headers=self.headers) - - # raise before trying to get `json` value - res.raise_for_status() - - if res.headers.get("Link"): - link = res.headers.get("Link") - # if we have a next link, that looks something like: - # ; rel="next" - # we want to extract the url and get the rest of the tags - assert link.endswith('; rel="next"') - next_link = link[link.find("<") + 1 : link.find(">")] - query = urllib.parse.urlparse(next_link).query - tags_url = f"{self.prefix}://{container.tags_url(query=query)}" # type: ignore - else: + headers = headers or self.headers + + # Save the base url to add parameters to, assuming only the params change + query = urllib.parse.urlparse(url).query + base_url = url.replace("?" + query, "") + + # get all results using the pagination + while len(results) < N and has_next_link: + response = self.do_request(url, "GET", headers=self.headers) + + # Check 200 response, show errors if any + self._check_200_response(response) + link = self._get_response_links(response).get("next") + + new_results = response.json() + if unwrap: + new_results = new_results.get(unwrap) + if new_results: + results += new_results + + # Get the next link + if not link: has_next_link = False + break - # if the package does not exist, the response is an - # {"errors":[{"code":"NAME_UNKNOWN","message":"repository name not known to registry"}]} - tags += res.json().get("tags", []) + # get query parameters to continue with next set of tags + query = urllib.parse.urlparse(link).query + url = f"{base_url}?{query}" - return tags + return results @ensure_container def get_blob( diff --git a/oras/version.py b/oras/version.py index 1a0abb4..fd96af0 100644 --- a/oras/version.py +++ b/oras/version.py @@ -2,7 +2,7 @@ __copyright__ = "Copyright The ORAS Authors." __license__ = "Apache-2.0" -__version__ = "0.1.13" +__version__ = "0.1.14" AUTHOR = "Vanessa Sochat" EMAIL = "vsoch@users.noreply.github.com" NAME = "oras" From 6148720d24cda6b7bc7fd16b66ab32d49a919303 Mon Sep 17 00:00:00 2001 From: Wolf Vollprecht Date: Sat, 4 Feb 2023 10:49:21 +0100 Subject: [PATCH 3/6] use callable in paginated request, and use links dictionary of requests to find link Signed-off-by: Wolf Vollprecht --- oras/provider.py | 69 ++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 44 deletions(-) diff --git a/oras/provider.py b/oras/provider.py index 0ec6b2d..7f3df5c 100644 --- a/oras/provider.py +++ b/oras/provider.py @@ -5,7 +5,7 @@ import copy import os import urllib -from typing import List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Union import jsonschema import requests @@ -234,73 +234,54 @@ def get_tags( :param container: parsed container URI :type container: oras.container.Container or str - :param N: number of tags + :param N: number of tags, -1 for all :type N: int """ tags_url = f"{self.prefix}://{container.tags_url(N=N)}" # type: ignore - return self.do_paginated_request(tags_url, N=N, unwrap="tags") + tags: List[str] = [] - def _get_response_links(self, response: requests.Response) -> dict: - """ - Get a named or all response links. + def extract_tags(response: requests.Response) -> bool: + json = response.json() + tags.extend(json.get("tags", [])) + return len(tags) < N or N == -1 - response.links does not seem reliable to always parse, so we - revert to using the raw requests library to parse the headers. - """ - # The links header to parse (empty if None) - link_header = response.headers.get("Link") - if not link_header: - return {} - links = requests.utils.parse_header_links( - link_header.rstrip(">").replace(">,<", ",<") - ) - return {x["rel"]: x["url"] for x in links} + self._do_paginated_request(tags_url, callable=extract_tags) + return tags - def do_paginated_request( - self, - url: str, - N: int = 10_000, - unwrap: Optional[str] = None, - headers: Optional[dict] = None, - ) -> List: + def _do_paginated_request( + self, url: str, callable: Callable[[requests.Response], bool] + ): """ Paginate a request for a URL. - We look for the "Link" header to get the next URL to ping. If provided, - we "unwrap" the provided field in the response json. + We look for the "Link" header to get the next URL to ping. If + the callable returns True, we continue to the next page, otherwise + we stop. """ - results: List[str] = [] - has_next_link = True - headers = headers or self.headers # Save the base url to add parameters to, assuming only the params change - query = urllib.parse.urlparse(url).query - base_url = url.replace("?" + query, "") + parts = urllib.parse.urlparse(url) + base_url = f"{parts.scheme}://{parts.netloc}" # get all results using the pagination - while len(results) < N and has_next_link: + while True: response = self.do_request(url, "GET", headers=self.headers) # Check 200 response, show errors if any self._check_200_response(response) - link = self._get_response_links(response).get("next") - new_results = response.json() - if unwrap: - new_results = new_results.get(unwrap) - if new_results: - results += new_results + want_more = callable(response) + if not want_more: + break + + link = response.links.get("next", {}).get("url") # Get the next link if not link: - has_next_link = False break - # get query parameters to continue with next set of tags - query = urllib.parse.urlparse(link).query - url = f"{base_url}?{query}" - - return results + # use link + base url to continue with next page + url = f"{base_url}{link}" @ensure_container def get_blob( From 956beef95a504b3a16016c3462f964ab425e98d7 Mon Sep 17 00:00:00 2001 From: Wolf Vollprecht Date: Sat, 4 Feb 2023 19:23:18 +0100 Subject: [PATCH 4/6] use optional[int] for N Signed-off-by: Wolf Vollprecht --- oras/container.py | 4 +--- oras/provider.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/oras/container.py b/oras/container.py index 884d7fd..4761344 100644 --- a/oras/container.py +++ b/oras/container.py @@ -54,9 +54,7 @@ def get_blob_url(self, digest: str) -> str: def upload_blob_url(self) -> str: return f"{self.registry}/v2/{self.api_prefix}/blobs/uploads/" - def tags_url(self, N=10_000, query=None) -> str: - if query: - return f"{self.registry}/v2/{self.api_prefix}/tags/list?{query}" + def tags_url(self, N=10_000) -> str: return f"{self.registry}/v2/{self.api_prefix}/tags/list?n={N}" def put_manifest_url(self) -> str: diff --git a/oras/provider.py b/oras/provider.py index 7f3df5c..0e62f1b 100644 --- a/oras/provider.py +++ b/oras/provider.py @@ -227,23 +227,30 @@ def upload_blob( @ensure_container def get_tags( - self, container: Union[str, oras.container.Container], N: int = 10_000 + self, container: Union[str, oras.container.Container], N: Optional[int] = None ) -> List[str]: """ Retrieve tags for a package. :param container: parsed container URI :type container: oras.container.Container or str - :param N: number of tags, -1 for all - :type N: int + :param N: limit number of tags, None for all (default) + :type N: Optional[int] """ - tags_url = f"{self.prefix}://{container.tags_url(N=N)}" # type: ignore + retrieve_all = N is None + if not N: + n_tags = 10_000 + else: + n_tags = N + tags_url = f"{self.prefix}://{container.tags_url(N=n_tags)}" # type: ignore tags: List[str] = [] def extract_tags(response: requests.Response) -> bool: json = response.json() - tags.extend(json.get("tags", [])) - return len(tags) < N or N == -1 + new_tags = json.get("tags", []) + tags.extend(new_tags) + # return true if we should continue + return len(new_tags) and (retrieve_all or len(tags) < n_tags) self._do_paginated_request(tags_url, callable=extract_tags) return tags From b361fe0ea8d53f81204fb5c490d94a32efe6297f Mon Sep 17 00:00:00 2001 From: vsoch Date: Sat, 4 Feb 2023 12:01:43 -0700 Subject: [PATCH 5/6] update types for tags function, add more tests and docs Signed-off-by: vsoch --- .pre-commit-config.yaml | 1 + docs/getting_started/user-guide.md | 24 ++++++++++++++++++++++++ oras/client.py | 4 ++-- oras/provider.py | 25 +++++++++++++++---------- oras/tests/test_oras.py | 27 +++++++++++++++++++++++++-- 5 files changed, 67 insertions(+), 14 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9d2c686..ad34a4e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,6 +30,7 @@ repos: language: python types: [python] entry: mypy + args: [] - id: flake8 name: flake8 diff --git a/docs/getting_started/user-guide.md b/docs/getting_started/user-guide.md index ab67e58..cf52fed 100644 --- a/docs/getting_started/user-guide.md +++ b/docs/getting_started/user-guide.md @@ -177,6 +177,30 @@ you should do blobs (layers) and the config first. +### Tags + +We provide a simple "get_tags" function to make it easy to instantiate a client and ask for tags from +a registry. Let's say we want to get tags from conda-forge. We could create a client: + +```python +import oras.client + +client = oras.client.OrasClient(hostname="ghcr.io", insecure=False) +``` + +And then ask for either a specific number of tags: + +```python +tags = client.get_tags("channel-mirrors/conda-forge/linux-aarch64/arrow-cpp", N=1005) +``` + +Or more likely, just ask for all tags (the default). + +```python +tags = client.get_tags("channel-mirrors/conda-forge/linux-aarch64/arrow-cpp") +``` +You can read more about how registries provide tags [at the distribution spec](https://github.com/opencontainers/distribution-spec/blob/067a0f5b0e256583bb9a088f72cba85ed043d1d2/spec.md?plain=1#L471-L513). + ### Push Interactions Let's start with a very basic push interaction, and this one diff --git a/oras/client.py b/oras/client.py index a3ac500..e234b3e 100644 --- a/oras/client.py +++ b/oras/client.py @@ -94,13 +94,13 @@ def version(self, return_items: bool = False) -> Union[dict, str]: # Otherwise return a string that can be printed return "\n".join(["%s: %s" % (k, v) for k, v in versions.items()]) - def get_tags(self, name: str, N: int = 10_000) -> List[str]: + def get_tags(self, name: str, N: int = -1) -> List[str]: """ Retrieve tags for a package. :param name: container URI to parse :type name: str - :param N: number of tags + :param N: number of tags (-1 to get all tags) :type N: int """ return self.remote.get_tags(name, N=N) diff --git a/oras/provider.py b/oras/provider.py index 0e62f1b..8fa22aa 100644 --- a/oras/provider.py +++ b/oras/provider.py @@ -227,32 +227,37 @@ def upload_blob( @ensure_container def get_tags( - self, container: Union[str, oras.container.Container], N: Optional[int] = None + self, container: Union[str, oras.container.Container], N: int = -1 ) -> List[str]: """ Retrieve tags for a package. :param container: parsed container URI :type container: oras.container.Container or str - :param N: limit number of tags, None for all (default) + :param N: limit number of tags, -1 for all (default) :type N: Optional[int] """ - retrieve_all = N is None - if not N: - n_tags = 10_000 - else: - n_tags = N - tags_url = f"{self.prefix}://{container.tags_url(N=n_tags)}" # type: ignore + # -1 is a flag for retrieving all, if set we use arbitrarily high number + retrieve_all = N == -1 + N = N if (N and N > 0) else 10_0000 + + tags_url = f"{self.prefix}://{container.tags_url(N=N)}" # type: ignore tags: List[str] = [] def extract_tags(response: requests.Response) -> bool: + """ + Determine if we should continue based on new tags and under limit. + """ json = response.json() new_tags = json.get("tags", []) tags.extend(new_tags) - # return true if we should continue - return len(new_tags) and (retrieve_all or len(tags) < n_tags) + return bool(len(new_tags) and (retrieve_all or len(tags) < N)) self._do_paginated_request(tags_url, callable=extract_tags) + + # If we got a longer set than was asked for + if len(tags) > N: + tags = tags[:N] return tags def _do_paginated_request( diff --git a/oras/tests/test_oras.py b/oras/tests/test_oras.py index e64e956..1ae4f07 100644 --- a/oras/tests/test_oras.py +++ b/oras/tests/test_oras.py @@ -97,10 +97,33 @@ def test_get_many_tags(): Test getting many tags """ client = oras.client.OrasClient(hostname="ghcr.io", insecure=False) + + # Test getting tags with a limit set tags = client.get_tags( - "channel-mirrors/conda-forge/linux-aarch64/arrow-cpp", N=100000 + "channel-mirrors/conda-forge/linux-aarch64/arrow-cpp", N=1005 + ) + assert len(tags) == 1005 + + # This should retrieve all tags (defaults to -1) + tags = client.get_tags("channel-mirrors/conda-forge/linux-aarch64/arrow-cpp") + assert len(tags) > 1500 + + # Same result (assuming doesn't change in small seconds between) + same_tags = client.get_tags( + "channel-mirrors/conda-forge/linux-aarch64/arrow-cpp", N=-1 ) - assert len(tags) > 1000 + assert not set(tags).difference(set(same_tags)) + + # None defaults to -1 too + same_tags = client.get_tags( + "channel-mirrors/conda-forge/linux-aarch64/arrow-cpp", N=None + ) + assert not set(tags).difference(set(same_tags)) + + # Small number of tags + tags = client.get_tags("channel-mirrors/conda-forge/linux-aarch64/arrow-cpp", N=10) + assert not set(tags).difference(set(same_tags)) + assert len(tags) == 10 @pytest.mark.skipif(with_auth, reason="token auth is needed for push and pull") From 9f0bd68a3085842983feed2d99f8b15273fc49f3 Mon Sep 17 00:00:00 2001 From: vsoch Date: Sat, 4 Feb 2023 12:02:48 -0700 Subject: [PATCH 6/6] remove empty args - not using it Signed-off-by: vsoch --- .pre-commit-config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ad34a4e..9d2c686 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,7 +30,6 @@ repos: language: python types: [python] entry: mypy - args: [] - id: flake8 name: flake8