From 13aa85b97c7801f7240e9e676740f65c83f0d94d Mon Sep 17 00:00:00 2001
From: Ryan Lambert
Date: Tue, 10 Aug 2021 17:33:52 -0600
Subject: [PATCH] Add unarchive step to load files by date. Add pg_dump export,
 addressing #131 at the same time.

---
 DOCKER-RUN.md        |   6 +--
 Makefile             |  20 ++++++---
 README.md            |   2 +-
 docker/db.py         |  20 ++++++++-
 docker/pgosm_flex.py | 104 +++++++++++++++++++++++++++++++++++--------
 5 files changed, 122 insertions(+), 30 deletions(-)

diff --git a/DOCKER-RUN.md b/DOCKER-RUN.md
index cb0290e..d2d70d7 100644
--- a/DOCKER-RUN.md
+++ b/DOCKER-RUN.md
@@ -20,7 +20,7 @@ the host machine's timezone, important when archiving PBF & MD5 files by date
 
 ```bash
-docker run --name pgosm -d \
+docker run --name pgosm -d --rm \
 -v ~/pgosm-data:/app/output \
 -v /etc/localtime:/etc/localtime:ro \
 -e POSTGRES_PASSWORD=mysecretpassword \
@@ -90,7 +90,7 @@ to skip this process.
 To force the processing to remove existing files and re-download the latest PBF and MD5 files from Geofabrik, set the `PGOSM_ALWAYS_DOWNLOAD` env var when running the Docker container.
 
 ```bash
-docker run --name pgosm -d \
+docker run --name pgosm -d --rm \
 -v ~/pgosm-data:/app/output \
 -e POSTGRES_PASSWORD=mysecretpassword \
 -e PGOSM_ALWAYS_DOWNLOAD=1 \
@@ -105,7 +105,7 @@ to customize Postgres' configuration at run-time in Docker.
 
 ```bash
-docker run --name pgosm -d \
+docker run --name pgosm -d --rm \
 -v ~/pgosm-data:/app/output \
 -v /etc/localtime:/etc/localtime:ro \
 -e POSTGRES_PASSWORD=mysecretpassword \
diff --git a/Makefile b/Makefile
index fe5c0a3..bc49ed5 100644
--- a/Makefile
+++ b/Makefile
@@ -27,16 +27,13 @@ build-run-docker:
 		pgosm:/app/output/district-of-columbia-$(TODAY).osm.pbf
 	docker cp tests/data/district-of-columbia-2021-01-13.osm.pbf.md5 \
 		pgosm:/app/output/district-of-columbia-$(TODAY).osm.pbf.md5
-	# TODO this double copy should not be needed, once the python script
-	# moves the files
-	docker cp tests/data/district-of-columbia-2021-01-13.osm.pbf \
-		pgosm:/app/output/district-of-columbia-latest.osm.pbf
-	docker cp tests/data/district-of-columbia-2021-01-13.osm.pbf.md5 \
-		pgosm:/app/output/district-of-columbia-latest.osm.pbf.md5
 
 	# allow files created in a later step to be owned by the current user
 	docker exec -it pgosm \
 		chown $(CURRENT_UID):$(CURRENT_GID) /app/output/
+	# Needed for unit-tests
+	docker exec -it pgosm \
+		chown $(CURRENT_UID):$(CURRENT_GID) /app/docker/
 
 	docker exec -it \
 		-e POSTGRES_PASSWORD=mysecretpassword \
@@ -46,4 +43,13 @@ build-run-docker:
 		--layerset=run-all \
 		--ram=8 \
 		--region=north-america/us \
-		--subregion=district-of-columbia
\ No newline at end of file
+		--subregion=district-of-columbia
+
+
+.PHONY: unit-tests
+unit-tests:
+	docker exec -it \
+		-e POSTGRES_PASSWORD=mysecretpassword \
+		-e POSTGRES_USER=postgres \
+		-u $(CURRENT_UID):$(CURRENT_GID) \
+		pgosm /bin/bash -c "cd docker && coverage run -m unittest tests/*.py"
diff --git a/README.md b/README.md
index 59ad19a..587a08d 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ Start the `pgosm` Docker container. At this point, PostgreSQL / PostGIS
 is available on port `5433`.
 
 ```bash
-docker run --name pgosm -d \
+docker run --name pgosm -d --rm \
 -v ~/pgosm-data:/app/output \
 -v /etc/localtime:/etc/localtime:ro \
 -e POSTGRES_PASSWORD=mysecretpassword \
diff --git a/docker/db.py b/docker/db.py
index e18e19b..afcbb5f 100644
--- a/docker/db.py
+++ b/docker/db.py
@@ -113,7 +113,7 @@ def connection_string(db_name):
     try:
         pg_pass = os.environ['POSTGRES_PASSWORD']
     except KeyError:
-        LOGGER.debug('POSTGRES_PASSWORD not configured. Might work if peer or ~/.pgpass is used.')
+        LOGGER.debug('POSTGRES_PASSWORD not configured. Should work if ~/.pgpass is configured.')
         pg_pass = None
 
     if pg_pass is None:
@@ -165,6 +165,7 @@ def pgosm_nested_admin_polygons(paths):
     ----------------------
     paths : dict
     """
+    LOGGER.warning('MISSING - Make nested admin polygons optional!')
     sql_raw = 'CALL osm.build_nested_admin_polygons();'
 
     conn_string = connection_string(db_name='pgosm')
@@ -177,3 +178,20 @@ def pgosm_nested_admin_polygons(paths):
                             check=True)
 
     LOGGER.info(f'Nested polygon output: \n {output.stderr}')
+
+def run_pg_dump(export_filename, out_path):
+    export_path = os.path.join(out_path, export_filename)
+    logger = logging.getLogger('pgosm-flex')
+    db_name = 'pgosm'
+    data_schema_name = 'osm'
+    conn_string = connection_string(db_name=db_name)
+    logger.info('Running pg_dump')
+    cmds = ['pg_dump', '-d', conn_string,
+            f'--schema={data_schema_name}',
+            '-f', export_path]
+    output = subprocess.run(cmds,
+                            text=True,
+                            capture_output=True,
+                            check=False)
+    logger.info(f'pg_dump output: \n {output.stderr}')
+
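The `run_pg_dump()` export above writes a plain-text `.sql` file, so it can be loaded into another Postgres database with `psql`. A minimal sketch of that restore step, not part of this patch; `restore_pg_dump()` is a hypothetical helper and both arguments are placeholders supplied by the caller:

```python
import subprocess

def restore_pg_dump(export_path, conn_string):
    """Loads a .sql export produced by run_pg_dump() using psql.

    export_path is the dump file; conn_string is a standard libpq
    connection string for the target database.
    """
    cmds = ['psql', '-d', conn_string, '-f', export_path]
    output = subprocess.run(cmds,
                            text=True,
                            capture_output=True,
                            check=False)
    # psql reports progress and errors on stderr, matching the
    # logging style used by run_pg_dump() above
    print(output.stderr)
```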
diff --git a/docker/pgosm_flex.py b/docker/pgosm_flex.py
index e92a4d6..71dd610 100644
--- a/docker/pgosm_flex.py
+++ b/docker/pgosm_flex.py
@@ -79,7 +79,8 @@ def run_pgosm_flex(layerset, ram, region, subregion, pgosm_date,
     run_post_processing(layerset=layerset, paths=paths,
                         skip_nested=skip_nested)
 
-    run_pg_dump()
+    export_filename = get_export_filename(region, subregion)
+    db.run_pg_dump(export_filename, out_path=paths['out_path'])
 
 
 def setup_logger(log_file, debug):
@@ -139,6 +140,10 @@ def get_paths(base_path):
 
     Creates `out_path` used for logs and data if necessary.
 
+    Parameters
+    -------------------
+    base_path : str
+
     Returns
     -------------------
     paths : dict
@@ -164,15 +169,37 @@ def get_region_filename(region, subregion):
 
     Returns
     ----------------------
-    region_filename : str
+    filename : str
     """
     base_name = '{}-latest.osm.pbf'
     if subregion == None:
-        region_filename = base_name.format(region)
+        filename = base_name.format(region)
     else:
-        region_filename = base_name.format(subregion)
+        filename = base_name.format(subregion)
 
-    return region_filename
+    return filename
+
+
+def get_export_filename(region, subregion):
+    """Returns the .sql filename to use for the pg_dump export.
+
+    Parameters
+    ----------------------
+    region : str
+    subregion : str
+
+    Returns
+    ----------------------
+    filename : str
+    """
+    region = region.replace('/', '-')
+    if subregion is None:
+        filename = f'pgosm-flex-{region}.sql'
+    else:
+        subregion = subregion.replace('/', '-')
+        filename = f'pgosm-flex-{region}-{subregion}.sql'
+
+    return filename
 
 
 def get_pbf_url(region, subregion):
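With the `None` check moved ahead of the `str.replace()` call, `get_export_filename()` no longer raises `AttributeError` for region-only runs. Illustrative usage, assuming the patched `docker/pgosm_flex.py` is on the import path; the values shown are examples, not patch output:

```python
from pgosm_flex import get_export_filename

# Slashes in the region become dashes in the filename.
print(get_export_filename('north-america/us', 'district-of-columbia'))
# pgosm-flex-north-america-us-district-of-columbia.sql

# Without a subregion the name uses the region alone.
print(get_export_filename('japan', None))
# pgosm-flex-japan.sql
```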
@@ -265,19 +292,24 @@ def prepare_data(region, subregion, pgosm_date, paths):
     if pbf_download_needed(pbf_file_with_date, md5_file_with_date):
         logging.getLogger('pgosm-flex').info('Downloading PBF and MD5 files...')
         download_data(region, subregion, pbf_file, md5_file)
+        archive_data(pbf_file, md5_file, pbf_file_with_date, md5_file_with_date)
     else:
-        logging.getLogger('pgosm-flex').warning('MISSING - Need to copy archived files to -latest filenames!')
+        logging.getLogger('pgosm-flex').info('Copying archived files')
+        unarchive_data(pbf_file, md5_file, pbf_file_with_date, md5_file_with_date)
 
     verify_checksum(md5_file, paths)
 
-    archive_data(pbf_file, md5_file,
-                 pbf_file_with_date, md5_file_with_date)
-
     return pbf_file
 
 
 def pbf_download_needed(pbf_file_with_date, md5_file_with_date):
-    """
+    """Decides if the PBF/MD5 files need to be downloaded.
+
+    Parameters
+    -------------------------------
+    pbf_file_with_date : str
+    md5_file_with_date : str
+
     Returns
     --------------------------
     download_needed : bool
@@ -287,7 +319,6 @@ def pbf_download_needed(pbf_file_with_date, md5_file_with_date):
 
     if os.path.exists(pbf_file_with_date):
         logger.info(f'PBF File exists {pbf_file_with_date}')
-
         if os.path.exists(md5_file_with_date):
             logger.info('PBF & MD5 files exist. Download not needed')
             download_needed = False
@@ -296,15 +327,16 @@ def pbf_download_needed(pbf_file_with_date, md5_file_with_date):
             print('PBF for today available but not MD5... download needed')
             download_needed = True
         else:
-            err = 'Cannot validate historic PBF file. Exiting'
+            err = 'Missing MD5 file. Cannot validate.'
             logger.error(err)
-            sys.exit(err)
+            raise FileNotFoundError(err)
     else:
         logger.info('PBF file not found locally. Download required')
         download_needed = True
 
     return download_needed
 
+
 def download_data(region, subregion, pbf_file, md5_file):
     logger = logging.getLogger('pgosm-flex')
     logger.info(f'Downloading PBF data to {pbf_file}')
@@ -341,8 +373,19 @@ def verify_checksum(md5_file, paths):
     return cmd
 
 
-def archive_data(pbf_file, md5_file,
-                 pbf_file_with_date, md5_file_with_date):
+def archive_data(pbf_file, md5_file, pbf_file_with_date, md5_file_with_date):
+    """Copies `pbf_file` and `md5_file` to `pbf_file_with_date` and
+    `md5_file_with_date`.
+
+    Skips the copy for any file whose date-stamped version already exists.
+
+    Parameters
+    --------------------------------
+    pbf_file : str
+    md5_file : str
+    pbf_file_with_date : str
+    md5_file_with_date : str
+    """
     if os.path.exists(pbf_file_with_date):
         pass # Do nothing
     else:
@@ -354,6 +397,34 @@ def archive_data(pbf_file, md5_file,
         shutil.copy2(md5_file, md5_file_with_date)
 
 
+def unarchive_data(pbf_file, md5_file, pbf_file_with_date, md5_file_with_date):
+    """Copies `pbf_file_with_date` and `md5_file_with_date`
+    to `pbf_file` and `md5_file`.
+
+    Always copies; overwrites an existing -latest file if one is in the way.
+
+    Parameters
+    --------------------------------
+    pbf_file : str
+    md5_file : str
+    pbf_file_with_date : str
+    md5_file_with_date : str
+    """
+    logger = logging.getLogger('pgosm-flex')
+    if os.path.exists(pbf_file):
+        logger.debug(f'{pbf_file} exists. Overwriting.')
+
+    logger.info(f'Copying {pbf_file_with_date} to {pbf_file}')
+    shutil.copy2(pbf_file_with_date, pbf_file)
+
+    if os.path.exists(md5_file):
+        logger.debug(f'{md5_file} exists. Overwriting.')
+
+    logger.info(f'Copying {md5_file_with_date} to {md5_file}')
+    shutil.copy2(md5_file_with_date, md5_file)
+
+
+
 def get_osm2pgsql_command(region, subregion, ram, layerset, paths):
     """Returns recommended osm2pgsql command.
 
@@ -424,9 +495,6 @@ def run_post_processing(layerset, paths, skip_nested):
         db.pgosm_nested_admin_polygons(paths)
 
 
-def run_pg_dump():
-    logging.getLogger('pgosm-flex').warning('MISSING - run pg_dump')
-
 
 if __name__ == "__main__":
     logging.getLogger('pgosm-flex').info('Running PgOSM Flex!')
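The new `unit-tests` Make target runs `unittest` from the `docker/` directory, so it picks up test modules under `docker/tests/`. A minimal sketch of such a module (hypothetical filename `tests/test_export_filename.py`; it only exercises `get_export_filename()` from this patch):

```python
import unittest

import pgosm_flex


class ExportFilenameTests(unittest.TestCase):
    """Checks the .sql export names built by get_export_filename()."""

    def test_slashes_in_region_become_dashes(self):
        actual = pgosm_flex.get_export_filename('north-america/us',
                                                'district-of-columbia')
        expected = 'pgosm-flex-north-america-us-district-of-columbia.sql'
        self.assertEqual(expected, actual)

    def test_region_only_when_subregion_is_none(self):
        actual = pgosm_flex.get_export_filename('japan', None)
        self.assertEqual('pgosm-flex-japan.sql', actual)


if __name__ == '__main__':
    unittest.main()
```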