diff --git a/CHANGELOG.md b/CHANGELOG.md index 1085c4d5..3b521cf8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,9 @@ ## 2.0.2 (TBD) -- fix bug in parsing time when calculating pause duration between requests (#1277) - fix bug where consolidate_intersections function would mutate the passed-in graph (#1273) +- fix bug in parsing time when calculating pause duration between requests (#1277) +- refactor internals of caching and pausing between requests (#1279) - provide user-friendly error message if consolidate_intersections is run more than once (#1273) - improve docstrings (#1272 #1274) diff --git a/osmnx/_http.py b/osmnx/_http.py index 438f0013..a30edd9c 100644 --- a/osmnx/_http.py +++ b/osmnx/_http.py @@ -30,24 +30,22 @@ def _save_to_cache( """ Save a HTTP response JSON object to a file in the cache folder. - This calculates the checksum of `url` to generate the cache file name. If - the request was sent to server via POST instead of GET, then `url` should + If request was sent to server via POST instead of GET, then `url` should be a GET-style representation of the request. Response is only saved to a - cache file if `settings.use_cache` is True, `response_json` is not None, - and `ok` is True. + cache file if `settings.use_cache` is True, `ok` is True, `response_json` + is not None, and `response_json` does not contain a server "remark." Users should always pass OrderedDicts instead of dicts of parameters into request functions, so the parameters remain in the same order each time, - producing the same URL string, and thus the same hash. Otherwise the cache - will eventually contain multiple saved responses for the same request - because the URL's parameters appeared in a different order each time. + producing the same URL string, and thus the same hash. Otherwise you will + get a cache miss when the URL's parameters appeared in a different order. Parameters ---------- url The URL of the request. response_json - The JSON response from the server. 
+ The JSON HTTP response. ok A `requests.response.ok` value. """ @@ -59,43 +57,53 @@ def _save_to_cache( msg = f"Did not save to cache because response contains remark: {response_json['remark']!r}" utils.log(msg, lg.WARNING) else: - # create the folder on the disk if it doesn't already exist - cache_folder = Path(settings.cache_folder) - cache_folder.mkdir(parents=True, exist_ok=True) - - # hash the url to make the filename succinct but unique - # sha1 digest is 160 bits = 20 bytes = 40 hexadecimal characters - checksum = sha1(url.encode("utf-8")).hexdigest() # noqa: S324 - cache_filepath = cache_folder / f"{checksum}.json" - - # dump to json, and save to file + # create cache folder on disk if it doesn't already exist + cache_filepath = _get_cache_filepath(url) + cache_filepath.parent.mkdir(parents=True, exist_ok=True) cache_filepath.write_text(json.dumps(response_json), encoding="utf-8") msg = f"Saved response to cache file {str(cache_filepath)!r}" utils.log(msg, level=lg.INFO) -def _url_in_cache(url: str) -> Path | None: +def _get_cache_filepath(key: str, extension: str = "json") -> Path: """ - Determine if a URL's response exists in the cache. + Determine a cache filepath for a key. - Calculates the checksum of `url` to determine the cache file's name. - Returns None if it cannot be found in the cache. + This uses the configured `settings.cache_folder` and calculates the 160 + bit SHA-1 hash digest (40 hexadecimal characters) of `key` to generate a + succinct but unique cache filename. Parameters ---------- - url - The URL to look for in the cache. + key + The key for which to generate a cache filepath, for example, a URL. + extension + The desired cache file extension. Returns ------- cache_filepath - Path to cached response for `url` if it exists, otherwise None. + Cache filepath corresponding to `key`. 
""" - # hash the url to generate the cache filename - checksum = sha1(url.encode("utf-8")).hexdigest() # noqa: S324 - cache_filepath = Path(settings.cache_folder) / f"{checksum}.json" + digest = sha1(key.encode("utf-8")).hexdigest() # noqa: S324 + return Path(settings.cache_folder) / f"{digest}.{extension}" + - # if this file exists in the cache, return its full path +def _check_cache(key: str) -> Path | None: + """ + Check if a key exists in the cache, and return its cache filepath if so. + + Parameters + ---------- + key + The key to look for in the cache. + + Returns + ------- + cache_filepath + Filepath to cached data for `key` if it exists, otherwise None. + """ + cache_filepath = _get_cache_filepath(key) return cache_filepath if cache_filepath.is_file() else None @@ -103,7 +111,7 @@ def _retrieve_from_cache(url: str) -> dict[str, Any] | list[dict[str, Any]] | No """ Retrieve a HTTP response JSON object from the cache if it exists. - Returns None if there is a server remark in the cached response. + A cache hit returns the data. A cache miss returns None. Parameters ---------- @@ -113,13 +121,12 @@ def _retrieve_from_cache(url: str) -> dict[str, Any] | list[dict[str, Any]] | No Returns ------- response_json - Cached response for `url` if it exists in the cache and does not - contain a server remark, otherwise None. + The cached response for `url` if it exists, otherwise None. 
""" # if the tool is configured to use the cache if settings.use_cache: # return cached response for this url if exists, otherwise return None - cache_filepath = _url_in_cache(url) + cache_filepath = _check_cache(url) if cache_filepath is not None: response_json: dict[str, Any] | list[dict[str, Any]] response_json = json.loads(cache_filepath.read_text(encoding="utf-8")) diff --git a/osmnx/_nominatim.py b/osmnx/_nominatim.py index 73ce065f..e73eda38 100644 --- a/osmnx/_nominatim.py +++ b/osmnx/_nominatim.py @@ -81,8 +81,6 @@ def _nominatim_request( params: OrderedDict[str, int | str], *, request_type: str = "search", - pause: float = 1, - error_pause: float = 60, ) -> list[dict[str, Any]]: """ Send a HTTP GET request to the Nominatim API and return response. @@ -92,13 +90,8 @@ def _nominatim_request( params Key-value pairs of parameters. request_type - {"search", "reverse", "lookup"} - Which Nominatim API endpoint to query. - pause - How long to pause before request, in seconds. Per the Nominatim usage - policy: "an absolute maximum of 1 request per second" is allowed. - error_pause - How long to pause in seconds before re-trying request if error. + Which Nominatim API endpoint to query, one of {"search", "reverse", + "lookup"}. Returns ------- @@ -120,7 +113,9 @@ def _nominatim_request( if isinstance(cached_response_json, list): return cached_response_json - # pause then request this URL + # how long to pause before request, in seconds. Per the Nominatim usage + # policy: "an absolute maximum of 1 request per second" is allowed. 
+ pause = 1 hostname = _http._hostname_from_url(url) msg = f"Pausing {pause} second(s) before making HTTP GET request to {hostname!r}" utils.log(msg, level=lg.INFO) @@ -139,6 +134,7 @@ def _nominatim_request( # handle 429 and 504 errors by pausing then recursively re-trying request if response.status_code in {429, 504}: # pragma: no cover + error_pause = 55 msg = ( f"{hostname!r} responded {response.status_code} {response.reason}: " f"we'll retry in {error_pause} secs" diff --git a/osmnx/_overpass.py b/osmnx/_overpass.py index 280b238a..83e0bb2e 100644 --- a/osmnx/_overpass.py +++ b/osmnx/_overpass.py @@ -141,8 +141,8 @@ def _get_network_filter(network_type: str) -> str: def _get_overpass_pause( base_endpoint: str, *, - recursive_delay: float = 4, - default_duration: float = 50, + recursion_pause: float = 5, + default_pause: float = 60, ) -> float: """ Retrieve a pause duration from the Overpass API status endpoint. @@ -155,19 +155,19 @@ def _get_overpass_pause( ---------- base_endpoint Base Overpass API URL (without "/status" at the end). - recursive_delay + recursion_pause How long to wait between recursive calls if the server is currently running a query. - default_duration - If a fatal error occurs, fall back on returning this value. + default_pause + If a fatal error occurs, fall back on this liberal pause duration. Returns ------- pause The current pause duration specified by the Overpass status endpoint. 
""" + # if overpass rate limiting is False, then there is zero pause if not settings.overpass_rate_limit: - # if overpass rate limiting is False, then there is zero pause return 0 url = base_endpoint.rstrip("/") + "/status" @@ -185,7 +185,7 @@ def _get_overpass_pause( # cannot reach status endpoint: log error and return default duration msg = f"Unable to reach {url}, {e}" utils.log(msg, level=lg.ERROR) - return default_duration + return default_pause # try to parse the output try: @@ -195,7 +195,7 @@ def _get_overpass_pause( # cannot parse output: log error and return default duration msg = f"Unable to parse {url} response: {response_text}" utils.log(msg, level=lg.ERROR) - return default_duration + return default_pause # determine the current status of the server try: @@ -215,16 +215,16 @@ def _get_overpass_pause( pause = max(seconds, 1) # if first token is 'Currently', it is currently running a query so - # check back in recursive_delay seconds + # check back in recursion_pause seconds elif status_first_part == "Currently": - time.sleep(recursive_delay) + time.sleep(recursion_pause) pause = _get_overpass_pause(base_endpoint) # any other status is unrecognized: log error, return default duration else: msg = f"Unrecognized server status: {status!r}" utils.log(msg, level=lg.ERROR) - return default_duration + return default_pause return pause @@ -428,12 +428,7 @@ def _download_overpass_features( yield _overpass_request(OrderedDict(data=query_str)) -def _overpass_request( - data: OrderedDict[str, Any], - *, - pause: float | None = None, - error_pause: float = 55, -) -> dict[str, Any]: +def _overpass_request(data: OrderedDict[str, Any]) -> dict[str, Any]: """ Send a HTTP POST request to the Overpass API and return response. @@ -441,12 +436,6 @@ def _overpass_request( ---------- data Key-value pairs of parameters. - pause - How long to pause in seconds before request. If None, will query API - status endpoint to find when next slot is available. 
- error_pause - How long to pause in seconds (in addition to `pause`) before re-trying - request if error. Returns ------- @@ -464,8 +453,7 @@ def _overpass_request( return cached_response_json # pause then request this URL - if pause is None: - pause = _get_overpass_pause(settings.overpass_url) + pause = _get_overpass_pause(settings.overpass_url) hostname = _http._hostname_from_url(url) msg = f"Pausing {pause} second(s) before making HTTP POST request to {hostname!r}" utils.log(msg, level=lg.INFO) @@ -484,6 +472,7 @@ def _overpass_request( # handle 429 and 504 errors by pausing then recursively re-trying request if response.status_code in {429, 504}: # pragma: no cover + error_pause = 55 msg = ( f"{hostname!r} responded {response.status_code} {response.reason}: " f"we'll retry in {error_pause} secs" diff --git a/osmnx/elevation.py b/osmnx/elevation.py index efed1725..d5e50365 100644 --- a/osmnx/elevation.py +++ b/osmnx/elevation.py @@ -5,7 +5,6 @@ import logging as lg import multiprocessing as mp import time -from hashlib import sha1 from pathlib import Path from typing import TYPE_CHECKING from typing import Any @@ -128,10 +127,9 @@ def _build_vrt_file(raster_paths: Iterable[str | Path]) -> Path: msg = "rio-vrt must be installed as an optional dependency to build VRTs." 
raise ImportError(msg) - # use the sha1 hash of the sorted filepaths as the VRT filename + # determine VRT cache filepath, from stringified sorted raster filepaths raster_paths = sorted(raster_paths) - checksum = sha1(str(raster_paths).encode("utf-8")).hexdigest() # noqa: S324 - vrt_path = Path(settings.cache_folder) / f"{checksum}.vrt" + vrt_path = _http._get_cache_filepath(str(raster_paths), "vrt") # build the VRT file if it doesn't already exist in the cache if not vrt_path.is_file(): diff --git a/tests/lint_test.sh b/tests/lint_test.sh index 5b3dc547..57ef287c 100755 --- a/tests/lint_test.sh +++ b/tests/lint_test.sh @@ -4,9 +4,6 @@ set -euo pipefail # delete temp files and folders rm -r -f ./.coverage* ./.pytest_cache ./.temp ./dist ./docs/build ./*/__pycache__ -# create all the configured environment/requirements files -python ./environments/make-env-files.py - # run the pre-commit hooks for linting/formatting SKIP=no-commit-to-branch pre-commit run --all-files