From 06fc1624daa844a10eddd4eaf4865dfb68b5374d Mon Sep 17 00:00:00 2001 From: Dave Rigby Date: Fri, 9 Feb 2024 11:45:51 +0000 Subject: [PATCH] Speedup list_datasets() / load_dataset() by 2.5x Construction of the Catalog object currently takes ~7.1s to complete. This is significant as both list_datasets() and load_dataset() require the construction of a Catalog object; so essentially _any_ operation with pinecone_datasets has a startup overhead of over 7s. Looking at where this time is spent, we see that the underlying gcsfs RPC library is issuing a large number of HTTP requests, and some repeatedly to the same URL. Specifically, we are issuing two GCS GET requests per dataset bucket - for example to access ANN_DEEP1B_d96_angular we observe the following calls (displayed by setting GCSFS_DEBUG=DEBUG env var): 2024-02-09 11:54:35,635 - gcsfs - DEBUG - _call -- GET: b/{}/o/{}, ('pinecone-datasets-dev', 'ANN_DEEP1B_d96_angular/metadata.json'), None 2024-02-09 11:54:35,749 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pinecone-datasets-dev/o/ANN_DEEP1B_d96_angular%2Fmetadata.json?alt=media, (), {'Range': 'bytes=0-440'} We also end up issuing multiple calls to list the bucket contents - e.g. there are 11 calls of the form: 2024-02-09 11:54:35,433 - gcsfs - DEBUG - _call -- GET: b/{}/o, ('pinecone-datasets-dev',), None In total we see 81 HTTP calls to construct a Catalog object comprising 25 datasets. Improve this by using gcsfs' higher-level fs.glob() method to fetch all matching filenames, without having to call listdir() and retrieve stats on each file. 
This results in a much simpler set of calls - two calls to list the bucket content, then one call per dataset: 2024-02-09 11:54:00,715 - gcsfs - DEBUG - _call -- GET: b/{}/o, ('pinecone-datasets-dev',), None 2024-02-09 11:54:03,139 - gcsfs - DEBUG - _call -- GET: b/{}/o, ('pinecone-datasets-dev',), None 2024-02-09 11:54:04,337 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pinecone-datasets-dev/o/ANN_DEEP1B_d96_angular%2Fmetadata.json?alt=media, (), {} 2024-02-09 11:54:04,338 - gcsfs - DEBUG - _call -- GET: https://storage.googleapis.com/download/storage/v1/b/pinecone-datasets-dev/o/ANN_Fashion-MNIST_d784_euclidean%2Fmetadata.json?alt=media, (), {} ... The total number of HTTP calls is reduced to 26. This has a corresponding reduction in the wall-clock time to construct the Catalog, down to 3.1s --- pinecone_datasets/catalog.py | 39 ++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/pinecone_datasets/catalog.py b/pinecone_datasets/catalog.py index 9a76cff..79d5542 100644 --- a/pinecone_datasets/catalog.py +++ b/pinecone_datasets/catalog.py @@ -72,26 +72,25 @@ def load(**kwargs) -> "Catalog": ) collected_datasets = [] try: - for f in fs.listdir(public_datasets_base_path): - if f["type"] == "directory": - try: - prefix = "gs" if isinstance(fs, gcsfs.GCSFileSystem) else "s3" - with fs.open(f"{prefix}://{f['name']}/metadata.json") as f: - try: - this_dataset_json = json.load(f) - except json.JSONDecodeError: - warnings.warn( - f"Not a JSON: Invalid metadata.json for {f['name']}, skipping" - ) - try: - this_dataset = DatasetMetadata(**this_dataset_json) - collected_datasets.append(this_dataset) - except ValidationError: - warnings.warn( - f"metadata file for dataset: {f['name']} is not valid, skipping" - ) - except FileNotFoundError: - pass + for name in fs.glob(public_datasets_base_path + '/*/metadata.json'): + try: + prefix = "gs" if isinstance(fs, gcsfs.GCSFileSystem) else "s3" + with fs.open(prefix + 
"://" + name) as f: + try: + this_dataset_json = json.load(f) + except json.JSONDecodeError: + warnings.warn( + f"Not a JSON: Invalid metadata.json for {name}, skipping" + ) + try: + this_dataset = DatasetMetadata(**this_dataset_json) + collected_datasets.append(this_dataset) + except ValidationError: + warnings.warn( + f"metadata file for dataset: {name} is not valid, skipping" + ) + except FileNotFoundError: + pass return Catalog(datasets=collected_datasets) except SSLCertVerificationError: raise ValueError("There is an Issue with loading the public catalog")