Code restructuring, cleanup, tests #206

Merged
merged 2 commits on Nov 24, 2021
2 changes: 1 addition & 1 deletion Makefile
@@ -133,7 +133,7 @@ unit-tests: ## Runs Python unit tests and data import tests
-e POSTGRES_PASSWORD=mysecretpassword \
-e POSTGRES_USER=postgres \
-u $(CURRENT_UID):$(CURRENT_GID) \
pgosm /bin/bash -c "cd docker && coverage run -m unittest tests/*.py"
pgosm /bin/bash -c "cd docker && coverage run -m unittest tests/*.py && coverage report -m ./*.py"

# Data import tests
docker cp tests \
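With this change the test target also prints a per-file coverage report with missing line numbers (`coverage report -m ./*.py`) after the unit tests finish, instead of only recording the coverage data. As a rough illustration of the kind of test that `coverage run -m unittest tests/*.py` discovers, the sketch below exercises get_region_filename from the new docker/geofabrik.py; the test module, class, and case names are hypothetical and not part of this PR.

# Hypothetical test sketch; the project's actual tests live in its tests/ directory.
import unittest

import geofabrik


class GeofabrikFilenameTests(unittest.TestCase):

    def test_filename_without_subregion_uses_region(self):
        actual = geofabrik.get_region_filename('north-america/us', None)
        self.assertEqual(actual, 'north-america/us-latest.osm.pbf')

    def test_filename_with_subregion_uses_subregion(self):
        actual = geofabrik.get_region_filename('north-america/us',
                                                'district-of-columbia')
        self.assertEqual(actual, 'district-of-columbia-latest.osm.pbf')


if __name__ == '__main__':
    unittest.main()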
239 changes: 239 additions & 0 deletions docker/geofabrik.py
@@ -0,0 +1,239 @@
"""This module handles the auto-file using Geofabrik's download service.
"""
import logging
import os
import shutil
import subprocess

import helpers


def get_region_filename(region, subregion):
"""Returns the filename needed to download/manage PBF files.

Parameters
----------------------
region : str
subregion : str

Returns
----------------------
filename : str
"""
base_name = '{}-latest.osm.pbf'
if subregion is None:
filename = base_name.format(region)
else:
filename = base_name.format(subregion)

return filename


def prepare_data(region, subregion, pgosm_date, paths):
"""Ensures the PBF file is available.

    Checks if the file already exists locally, downloads it if needed,
    and verifies the MD5 checksum.

Parameters
----------------------
region : str
subregion : str
pgosm_date : str
paths : dict

Returns
----------------------
pbf_file : str
Full path to PBF file
"""
out_path = paths['out_path']
pbf_filename = get_region_filename(region, subregion)

pbf_file = os.path.join(out_path, pbf_filename)
pbf_file_with_date = pbf_file.replace('latest', pgosm_date)

md5_file = f'{pbf_file}.md5'
md5_file_with_date = f'{pbf_file_with_date}.md5'

if pbf_download_needed(pbf_file_with_date, md5_file_with_date, pgosm_date):
logging.getLogger('pgosm-flex').info('Downloading PBF and MD5 files...')
download_data(region, subregion, pbf_file, md5_file)
archive_data(pbf_file, md5_file, pbf_file_with_date, md5_file_with_date)
else:
logging.getLogger('pgosm-flex').info('Copying Archived files')
unarchive_data(pbf_file, md5_file, pbf_file_with_date, md5_file_with_date)

helpers.verify_checksum(md5_file, paths['out_path'])

return pbf_file


def pbf_download_needed(pbf_file_with_date, md5_file_with_date, pgosm_date):
"""Decides if the PBF/MD5 files need to be downloaded.

Parameters
-------------------------------
    pbf_file_with_date : str
    md5_file_with_date : str
    pgosm_date : str

Returns
--------------------------
download_needed : bool
"""
logger = logging.getLogger('pgosm-flex')
# If the PBF file exists, check for the MD5 file too.
if os.path.exists(pbf_file_with_date):
logger.info(f'PBF File exists {pbf_file_with_date}')

if os.path.exists(md5_file_with_date):
logger.info('PBF & MD5 files exist. Download not needed')
download_needed = False
else:
if pgosm_date == helpers.get_today():
                logger.info('PBF for today available but not MD5... download needed')
download_needed = True
else:
err = f'Missing MD5 file for {pgosm_date}. Cannot validate.'
logger.error(err)
raise FileNotFoundError(err)
else:
if not pgosm_date == helpers.get_today():
err = f'Missing PBF file for {pgosm_date}. Cannot proceed.'
logger.error(err)
raise FileNotFoundError(err)

logger.info('PBF file not found locally. Download required')
download_needed = True

return download_needed
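
# Illustrative summary of the branches above (not part of the module's logic):
#   PBF exists,  MD5 exists                   -> no download needed
#   PBF exists,  MD5 missing, date is today   -> download needed
#   PBF exists,  MD5 missing, date not today  -> FileNotFoundError
#   PBF missing, date is today                -> download needed
#   PBF missing, date not today               -> FileNotFoundError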


def get_pbf_url(region, subregion):
"""Returns the URL to the PBF for the region / subregion.

Parameters
----------------------
region : str
subregion : str

Returns
----------------------
pbf_url : str
"""
base_url = 'https://download.geofabrik.de'

if subregion is None:
pbf_url = f'{base_url}/{region}-latest.osm.pbf'
else:
pbf_url = f'{base_url}/{region}/{subregion}-latest.osm.pbf'

return pbf_url
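
# Illustrative URLs (assumed region/subregion values, not from this PR):
#   get_pbf_url('north-america/us', None)
#     -> https://download.geofabrik.de/north-america/us-latest.osm.pbf
#   get_pbf_url('north-america/us', 'district-of-columbia')
#     -> https://download.geofabrik.de/north-america/us/district-of-columbia-latest.osm.pbf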


def download_data(region, subregion, pbf_file, md5_file):
"""Downloads PBF and MD5 file using wget.

Parameters
---------------------
region : str
subregion : str
pbf_file : str
md5_file : str
"""
logger = logging.getLogger('pgosm-flex')
logger.info(f'Downloading PBF data to {pbf_file}')
pbf_url = get_pbf_url(region, subregion)

subprocess.run(
['/usr/bin/wget', pbf_url,
"-O", pbf_file , "--quiet"
],
capture_output=True,
text=True,
check=True
)

logger.info(f'Downloading MD5 checksum to {md5_file}')
subprocess.run(
['/usr/bin/wget', f'{pbf_url}.md5',
"-O", md5_file , "--quiet"
],
capture_output=True,
text=True,
check=True
)


def archive_data(pbf_file, md5_file, pbf_file_with_date, md5_file_with_date):
"""Copies `pbf_file` and `md5_file` to `pbf_file_with_date` and
`md5_file_with_date`.

    Each copy is skipped if the dated target file already exists.

Parameters
--------------------------------
pbf_file : str
md5_file : str
pbf_file_with_date : str
md5_file_with_date : str
"""
    if not os.path.exists(pbf_file_with_date):
        shutil.copy2(pbf_file, pbf_file_with_date)

    if not os.path.exists(md5_file_with_date):
        shutil.copy2(md5_file, md5_file_with_date)



def unarchive_data(pbf_file, md5_file, pbf_file_with_date, md5_file_with_date):
"""Copies `pbf_file_with_date` and `md5_file_with_date`
to `pbf_file` and `md5_file`.

    Always copies, overwriting any -latest file that is already in place.

Parameters
--------------------------------
pbf_file : str
md5_file : str
pbf_file_with_date : str
md5_file_with_date : str
"""
logger = logging.getLogger('pgosm-flex')
if os.path.exists(pbf_file):
logger.debug(f'{pbf_file} exists. Overwriting.')

logger.info(f'Copying {pbf_file_with_date} to {pbf_file}')
shutil.copy2(pbf_file_with_date, pbf_file)

if os.path.exists(md5_file):
logger.debug(f'{md5_file} exists. Overwriting.')

logger.info(f'Copying {md5_file_with_date} to {md5_file}')
shutil.copy2(md5_file_with_date, md5_file)


def remove_latest_files(region, subregion, paths):
"""Removes the PBF and MD5 file with -latest in the name.

Files are archived via prepare_data() before processing starts

Parameters
-------------------------
region : str
subregion : str
paths : dict
"""
pbf_filename = get_region_filename(region, subregion)

pbf_file = os.path.join(paths['out_path'], pbf_filename)
md5_file = f'{pbf_file}.md5'
    logger = logging.getLogger('pgosm-flex')
    logger.info(f'Done with {pbf_file}, removing.')
    os.remove(pbf_file)
    logger.info(f'Done with {md5_file}, removing.')
    os.remove(md5_file)
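
Taken together, the module gives the import pipeline a prepare/process/cleanup flow. The sketch below shows roughly how a caller could wire these functions up; the region, subregion, output directory, and surrounding driver are assumptions for illustration, not code from this PR.

# Hypothetical driver, assuming geofabrik.py and helpers.py are importable.
import geofabrik
import helpers

region = 'north-america/us'           # assumed region
subregion = 'district-of-columbia'    # assumed subregion
paths = {'out_path': '/app/output'}   # assumed output directory
pgosm_date = helpers.get_today()

# Download the PBF/MD5 pair (or reuse the archived, dated copies) and
# verify the checksum; returns the full path to the -latest PBF file.
pbf_file = geofabrik.prepare_data(region, subregion, pgosm_date, paths)

# ... run the osm2pgsql / Flex import against pbf_file here ...

# Remove the -latest copies once processing finishes; the dated archives remain.
geofabrik.remove_latest_files(region, subregion, paths)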

30 changes: 30 additions & 0 deletions docker/helpers.py
@@ -0,0 +1,30 @@
"""Generic functions used in multiple modules.
"""
import datetime
import subprocess


def get_today():
"""Returns yyyy-mm-dd formatted string for today.

    Returns
-------------------------
today : str
"""
today = datetime.datetime.today().strftime('%Y-%m-%d')
return today


def verify_checksum(md5_file, out_path):
"""If verfication fails, raises `CalledProcessError`

Parameters
---------------------
md5_file : str
out_path : str
"""
    # check=True raises subprocess.CalledProcessError on a checksum mismatch
    subprocess.run(['md5sum', '-c', md5_file],
capture_output=True,
text=True,
check=True,
cwd=out_path)
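
For completeness, a small sketch of the helpers in use; the file name and directory below are assumptions. Because verify_checksum passes check=True to subprocess.run, a failed checksum surfaces as subprocess.CalledProcessError rather than as a return value.

# Hypothetical usage of the helpers module.
import subprocess

import helpers

print(helpers.get_today())   # e.g. '2021-11-24'

try:
    # md5sum runs with cwd=out_path, so relative paths inside the .md5 file
    # resolve against the output directory.
    helpers.verify_checksum('district-of-columbia-latest.osm.pbf.md5',
                            '/app/output')
except subprocess.CalledProcessError as err:
    print(f'Checksum verification failed: {err.stderr}')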