From 81186ca4e99d2e777ecf5946c1ac04809be65f6f Mon Sep 17 00:00:00 2001 From: Nick Farrell Date: Thu, 28 Nov 2024 05:36:31 +1100 Subject: [PATCH 1/3] s3: do not treat an empty iter_key response as successful When requesting a list of files using the S3 protocol, a valid response will always include a Contents key in the JSON-encoded body. If this is not present, allow an exception to bubble up, instead of pretending that the bucket/prefix is empty. It is preferable to raise an exception than to pretend to have a valid response. --- rohmu/object_storage/s3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rohmu/object_storage/s3.py b/rohmu/object_storage/s3.py index e19fd693..51247f69 100644 --- a/rohmu/object_storage/s3.py +++ b/rohmu/object_storage/s3.py @@ -369,7 +369,7 @@ def iter_key( self.stats.operation(StorageOperation.iter_key) response = self.get_client().list_objects_v2(**args) - for item in response.get("Contents", []): + for item in response["Contents"]: if with_metadata: try: metadata = {k.lower(): v for k, v in self._metadata_for_key(item["Key"]).items()} From 6a19820d2a0848118ecea9e914010e2aabe8d2dc Mon Sep 17 00:00:00 2001 From: Nick Farrell Date: Thu, 28 Nov 2024 05:43:11 +1100 Subject: [PATCH 2/3] s3: collect HTTP response stats for unstable operations In some situations, inconsistent responses have been observed while reading/iterating over files using the S3 protocol. Record the HTTP status codes received during this operations as stats. --- rohmu/object_storage/s3.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/rohmu/object_storage/s3.py b/rohmu/object_storage/s3.py index 51247f69..87617d04 100644 --- a/rohmu/object_storage/s3.py +++ b/rohmu/object_storage/s3.py @@ -368,6 +368,9 @@ def iter_key( args["ContinuationToken"] = continuation_token self.stats.operation(StorageOperation.iter_key) response = self.get_client().list_objects_v2(**args) + status_code = response.get("ResponseMetadata", {}).get("HTTPStatusCode") + if status_code: + self.stats.increase(metric="rohmu.s3.iter_key_response", tags={"status_code": str(status_code)}) for item in response["Contents"]: if with_metadata: @@ -405,17 +408,22 @@ def _get_object_stream(self, key: str, byte_range: Optional[tuple[int, int]]) -> kwargs: dict[str, Any] = {} if byte_range: kwargs["Range"] = f"bytes={byte_range[0]}-{byte_range[1]}" + status_code : int|None = None try: # Actual usage is accounted for in # _read_object_to_fileobj, although that omits the initial # get_object call if it fails. response = self.get_client().get_object(Bucket=self.bucket_name, Key=path, **kwargs) + status_code = response.get("ResponseMetadata", {}).get("HTTPStatusCode") except botocore.exceptions.ClientError as ex: status_code = ex.response.get("ResponseMetadata", {}).get("HTTPStatusCode") if status_code == 404: raise FileNotFoundFromStorageError(path) else: raise StorageError(f"Fetching the remote object {path} failed") from ex + finally: + if status_code: + self.stats.increase(metric="rohmu.s3.get_object_stream_response", tags={"status_code": str(status_code)}) return response["Body"], response["ContentLength"], response["Metadata"] def _read_object_to_fileobj( From 0e3639a6e86b0e98a82e63e9231d7f48795017e2 Mon Sep 17 00:00:00 2001 From: Nick Farrell Date: Thu, 28 Nov 2024 08:39:31 +1100 Subject: [PATCH 3/3] s3: do not mask failures to retrieve metadata If a file's metadata is being requested and it cannot be found, this is a real error. We should allow this exception to bubble up, rather than masking it. Some backends (e.g. minio) may provide a `KeyCount` value. If this is present and zero, assume no records are present, and do not look for the `Contents` list. --- rohmu/object_storage/s3.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/rohmu/object_storage/s3.py b/rohmu/object_storage/s3.py index 87617d04..f7175e97 100644 --- a/rohmu/object_storage/s3.py +++ b/rohmu/object_storage/s3.py @@ -372,25 +372,23 @@ def iter_key( if status_code: self.stats.increase(metric="rohmu.s3.iter_key_response", tags={"status_code": str(status_code)}) - for item in response["Contents"]: - if with_metadata: - try: + if response.get("KeyCount") != 0: + for item in response["Contents"]: + if with_metadata: metadata = {k.lower(): v for k, v in self._metadata_for_key(item["Key"]).items()} - except FileNotFoundFromStorageError: - continue - else: - metadata = None - name = self.format_key_from_backend(item["Key"]) - yield IterKeyItem( - type=KEY_TYPE_OBJECT, - value={ - "last_modified": item["LastModified"], - "md5": item["ETag"].strip('"'), - "metadata": metadata, - "name": name, - "size": item["Size"], - }, - ) + else: + metadata = None + name = self.format_key_from_backend(item["Key"]) + yield IterKeyItem( + type=KEY_TYPE_OBJECT, + value={ + "last_modified": item["LastModified"], + "md5": item["ETag"].strip('"'), + "metadata": metadata, + "name": name, + "size": item["Size"], + }, + ) for common_prefix in response.get("CommonPrefixes", []): yield IterKeyItem( @@ -408,7 +406,7 @@ def _get_object_stream(self, key: str, byte_range: Optional[tuple[int, int]]) -> kwargs: dict[str, Any] = {} if byte_range: kwargs["Range"] = f"bytes={byte_range[0]}-{byte_range[1]}" - status_code : int|None = None + status_code: int | None = None try: # Actual usage is accounted for in # _read_object_to_fileobj, although that omits the initial