Skip to content

Commit

Permalink
Add unarchive data to load files from date. Add pg_dump, addresses #131
Browse files Browse the repository at this point in the history
… at the same time.
  • Loading branch information
rustprooflabs committed Aug 10, 2021
1 parent 824d086 commit 13aa85b
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 30 deletions.
6 changes: 3 additions & 3 deletions DOCKER-RUN.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ the host machine's timezone, important for archiving PBF & MD5 files b


```bash
docker run --name pgosm -d \
docker run --name pgosm -d --rm \
-v ~/pgosm-data:/app/output \
-v /etc/localtime:/etc/localtime:ro \
-e POSTGRES_PASSWORD=mysecretpassword \
Expand Down Expand Up @@ -90,7 +90,7 @@ to skip this process.
To force the processing to remove existing files and re-download the latest PBF and MD5 files from Geofabrik, set the `PGOSM_ALWAYS_DOWNLOAD` env var when running the Docker container.

```bash
docker run --name pgosm -d \
docker run --name pgosm -d --rm \
-v ~/pgosm-data:/app/output \
-e POSTGRES_PASSWORD=mysecretpassword \
-e PGOSM_ALWAYS_DOWNLOAD=1 \
Expand All @@ -105,7 +105,7 @@ to customize Postgres' configuration at run-time in Docker.


```bash
docker run --name pgosm -d \
docker run --name pgosm -d --rm \
-v ~/pgosm-data:/app/output \
-v /etc/localtime:/etc/localtime:ro \
-e POSTGRES_PASSWORD=mysecretpassword \
Expand Down
20 changes: 13 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,13 @@ build-run-docker:
pgosm:/app/output/district-of-columbia-$(TODAY).osm.pbf
docker cp tests/data/district-of-columbia-2021-01-13.osm.pbf.md5 \
pgosm:/app/output/district-of-columbia-$(TODAY).osm.pbf.md5
# TODO this double copy should not be needed, once the python script
# moves the files
docker cp tests/data/district-of-columbia-2021-01-13.osm.pbf \
pgosm:/app/output/district-of-columbia-latest.osm.pbf
docker cp tests/data/district-of-columbia-2021-01-13.osm.pbf.md5 \
pgosm:/app/output/district-of-columbia-latest.osm.pbf.md5

	# allow files created in a later step to be owned by the current user
docker exec -it pgosm \
chown $(CURRENT_UID):$(CURRENT_GID) /app/output/
# Needed for unit-tests
docker exec -it pgosm \
chown $(CURRENT_UID):$(CURRENT_GID) /app/docker/

docker exec -it \
-e POSTGRES_PASSWORD=mysecretpassword \
Expand All @@ -46,4 +43,13 @@ build-run-docker:
--layerset=run-all \
--ram=8 \
--region=north-america/us \
--subregion=district-of-columbia
--subregion=district-of-columbia


.PHONY: unit-tests
unit-tests:
docker exec -it \
-e POSTGRES_PASSWORD=mysecretpassword \
-e POSTGRES_USER=postgres \
-u $(CURRENT_UID):$(CURRENT_GID) \
pgosm /bin/bash -c "cd docker && coverage run -m unittest tests/*.py"
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ Start the `pgosm` Docker container. At this point, PostgreSQL / PostGIS
is available on port `5433`.

```bash
docker run --name pgosm -d \
docker run --name pgosm -d --rm \
-v ~/pgosm-data:/app/output \
-v /etc/localtime:/etc/localtime:ro \
-e POSTGRES_PASSWORD=mysecretpassword \
Expand Down
20 changes: 19 additions & 1 deletion docker/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def connection_string(db_name):
try:
pg_pass = os.environ['POSTGRES_PASSWORD']
except KeyError:
LOGGER.debug('POSTGRES_PASSWORD not configured. Might work if peer or ~/.pgpass is used.')
LOGGER.debug('POSTGRES_PASSWORD not configured. Should work if ~/.pgpass is configured.')
pg_pass = None

if pg_pass is None:
Expand Down Expand Up @@ -165,6 +165,7 @@ def pgosm_nested_admin_polygons(paths):
----------------------
paths : dict
"""
LOGGER.warning('MISSING - Make nested admin polygons optional!')
sql_raw = 'CALL osm.build_nested_admin_polygons();'

conn_string = connection_string(db_name='pgosm')
Expand All @@ -177,3 +178,20 @@ def pgosm_nested_admin_polygons(paths):
check=True)
LOGGER.info(f'Nested polygon output: \n {output.stderr}')


def run_pg_dump(export_filename, out_path):
    """Runs pg_dump to export the osm schema of the pgosm database to a
    plain-text SQL file.

    Parameters
    ---------------------
    export_filename : str
        Filename (no path) for the .sql output.
    out_path : str
        Directory the export file is written to.
    """
    export_path = os.path.join(out_path, export_filename)
    logger = logging.getLogger('pgosm-flex')
    db_name = 'pgosm'
    data_schema_name = 'osm'
    conn_string = connection_string(db_name=db_name)
    logger.info('Running pg_dump')
    cmds = ['pg_dump', '-d', conn_string,
            f'--schema={data_schema_name}',
            '-f', export_path]
    # check=False: pg_dump failure is reported via the log, not an exception
    output = subprocess.run(cmds,
                            text=True,
                            capture_output=True,
                            check=False)
    if output.returncode != 0:
        logger.error(f'pg_dump failed (exit {output.returncode})')
    logger.info(f'pg_dump output: \n {output.stderr}')

104 changes: 86 additions & 18 deletions docker/pgosm_flex.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ def run_pgosm_flex(layerset, ram, region, subregion, pgosm_date,
run_post_processing(layerset=layerset, paths=paths,
skip_nested=skip_nested)

run_pg_dump()
export_filename = get_export_filename(region, subregion)
db.run_pg_dump(export_filename, out_path=paths['out_path'])


def setup_logger(log_file, debug):
Expand Down Expand Up @@ -139,6 +140,10 @@ def get_paths(base_path):
Creates `out_path` used for logs and data if necessary.
Parameters
-------------------
base_path : str
Returns
-------------------
paths : dict
Expand All @@ -164,15 +169,37 @@ def get_region_filename(region, subregion):
Returns
----------------------
region_filename : str
filename : str
"""
base_name = '{}-latest.osm.pbf'
if subregion == None:
region_filename = base_name.format(region)
filename = base_name.format(region)
else:
region_filename = base_name.format(subregion)
filename = base_name.format(subregion)

return region_filename
return filename


def get_export_filename(region, subregion):
    """Returns the .sql filename to use for the pg_dump export.

    Slashes in region/subregion (e.g. 'north-america/us') are replaced
    with dashes so the result is a valid single filename.

    Parameters
    ----------------------
    region : str
    subregion : str or None

    Returns
    ----------------------
    filename : str
    """
    # Only touch subregion after confirming it is not None; the original
    # order raised AttributeError for region-only runs.
    region = region.replace('/', '-')
    if subregion is None:
        filename = f'pgosm-flex-{region}.sql'
    else:
        subregion = subregion.replace('/', '-')
        filename = f'pgosm-flex-{region}-{subregion}.sql'

    return filename


def get_pbf_url(region, subregion):
Expand Down Expand Up @@ -265,19 +292,24 @@ def prepare_data(region, subregion, pgosm_date, paths):
if pbf_download_needed(pbf_file_with_date, md5_file_with_date):
logging.getLogger('pgosm-flex').info('Downloading PBF and MD5 files...')
download_data(region, subregion, pbf_file, md5_file)
archive_data(pbf_file, md5_file, pbf_file_with_date, md5_file_with_date)
else:
logging.getLogger('pgosm-flex').warning('MISSING - Need to copy archived files to -latest filenames!')
logging.getLogger('pgosm-flex').info('Copying Archived files')
unarchive_data(pbf_file, md5_file, pbf_file_with_date, md5_file_with_date)

verify_checksum(md5_file, paths)

archive_data(pbf_file, md5_file,
pbf_file_with_date, md5_file_with_date)

return pbf_file


def pbf_download_needed(pbf_file_with_date, md5_file_with_date):
"""
"""Decides if the PBF/MD5 files need to be downloaded.
Parameters
-------------------------------
pbf_file_with_date : str
md5_file_with_date : str
Returns
--------------------------
download_needed : bool
Expand All @@ -287,7 +319,6 @@ def pbf_download_needed(pbf_file_with_date, md5_file_with_date):
if os.path.exists(pbf_file_with_date):
logger.info(f'PBF File exists {pbf_file_with_date}')


if os.path.exists(md5_file_with_date):
logger.info('PBF & MD5 files exist. Download not needed')
download_needed = False
Expand All @@ -296,15 +327,16 @@ def pbf_download_needed(pbf_file_with_date, md5_file_with_date):
print('PBF for today available but not MD5... download needed')
download_needed = True
else:
err = 'Cannot validate historic PBF file. Exiting'
err = 'Missing MD5 file. Cannot validate.'
logger.error(err)
sys.exit(err)
raise FileNotFoundError(err)
else:
logger.info('PBF file not found locally. Download required')
download_needed = True

return download_needed


def download_data(region, subregion, pbf_file, md5_file):
logger = logging.getLogger('pgosm-flex')
logger.info(f'Downloading PBF data to {pbf_file}')
Expand Down Expand Up @@ -341,8 +373,19 @@ def verify_checksum(md5_file, paths):
return cmd


def archive_data(pbf_file, md5_file,
pbf_file_with_date, md5_file_with_date):
def archive_data(pbf_file, md5_file, pbf_file_with_date, md5_file_with_date):
"""Copies `pbf_file` and `md5_file` to `pbf_file_with_date` and
`md5_file_with_date`.
If either file exists, does nothing.
Parameters
--------------------------------
pbf_file : str
md5_file : str
pbf_file_with_date : str
md5_file_with_date : str
"""
if os.path.exists(pbf_file_with_date):
pass # Do nothing
else:
Expand All @@ -354,6 +397,34 @@ def archive_data(pbf_file, md5_file,
shutil.copy2(md5_file, md5_file_with_date)


def unarchive_data(pbf_file, md5_file, pbf_file_with_date, md5_file_with_date):
    """Restores dated PBF/MD5 archive files to their -latest filenames.

    Copies `pbf_file_with_date` to `pbf_file` and `md5_file_with_date`
    to `md5_file`.

    Always copies, will overwrite a -latest file if it is in the way.

    Parameters
    --------------------------------
    pbf_file : str
    md5_file : str
    pbf_file_with_date : str
    md5_file_with_date : str
    """
    logger = logging.getLogger('pgosm-flex')
    copy_pairs = ((pbf_file_with_date, pbf_file),
                  (md5_file_with_date, md5_file))

    for source, target in copy_pairs:
        if os.path.exists(target):
            logger.debug(f'{target} exists. Overwriting.')

        logger.info(f'Copying {source} to {target}')
        shutil.copy2(source, target)



def get_osm2pgsql_command(region, subregion, ram, layerset, paths):
"""Returns recommended osm2pgsql command.
Expand Down Expand Up @@ -424,9 +495,6 @@ def run_post_processing(layerset, paths, skip_nested):
db.pgosm_nested_admin_polygons(paths)


def run_pg_dump():
logging.getLogger('pgosm-flex').warning('MISSING - run pg_dump')


if __name__ == "__main__":
logging.getLogger('pgosm-flex').info('Running PgOSM Flex!')
Expand Down

0 comments on commit 13aa85b

Please sign in to comment.