From c8e8c9fa59f058a0cbd2a5c6bb293bb4a1f11970 Mon Sep 17 00:00:00 2001
From: Virginia Partridge
Date: Tue, 9 Jul 2024 10:31:28 -0400
Subject: [PATCH] Added change log details and DHIS2 info in settings config
 file (#3)

* Added change log details and DHIS2 info in settings config file

* Mock calls to the DHIS2 server in unit tests
---
 CHANGELOG.md                         |  29 +---
 LICENSE.md                           |   2 +-
 README.md                            | 219 +--------------------------
 data/.gitignore                      |   2 +-
 pyproject.toml                       |   5 +-
 settings.ini                         |   4 +
 src/msfocr/data/__init__.py          |   0
 src/msfocr/data/data_upload_DHIS2.py |  25 ++-
 src/msfocr/docTR/ocr_functions.py    |   6 +-
 tests/conftest.py                    |  27 +++-
 tests/test_ocr_functions.py          |  10 +-
 11 files changed, 75 insertions(+), 254 deletions(-)
 create mode 100644 settings.ini
 create mode 100644 src/msfocr/data/__init__.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d00ecbc..159c109 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,28 +6,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 You should also add project tags for each release in Github, see [Managing releases in a repository](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository).
 
-## [2.0.0] - 2024-05-29
+## [0.0.1] - 2024-07-03
 ### Added
-- Added example auto-built Sphinx documentation in the `docs` folder
-- Github workflow for running ruff linter
-- A note about conda dependencies to README
-- A note about using docker containers to README
-- Ruff as a linter for development
-### Changed
-- All build and packaging switched to use only pyproject.toml
-- Minimum python version changed to 3.10
-- Github workflow checks python versions 3.10, 3.11, 3.12
-- Updated DVC version to avoid `ImportError: cannot import name 'fsspec_loop'` in older versions
-### Removed
-- Removed setup.cfg
-
-## [1.0.0] - 2022-05-23
-### Added
-- README and CHANGELOG
-- cdstemplate packages for computing word count from input text
-- corpus_counter_script.py as a user-facing script with argparse examples
-- Tests of cdstemplate packages
-- A github workflow to trigger tests on pull request to the main branch
-- Sample text data from Project Gutenberg
-- Data Version Control stage for the corpus_counter_script.py
-- A sample Jupyter notebook that plots most frequent words the Gutenberg data
+- Notebook for downloading sample test data
+- msfocr.data.data_upload_DHIS2 created for sending key/value pairs to a DHIS2 server
+- msfocr.docTR created to extract tables from images using img2table and docTR
+- Initial package structure created
diff --git a/LICENSE.md b/LICENSE.md
index 00a65ef..e319bb7 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023 University of Massachusetts Amherst, Center for Data Science
+Copyright (c) 2024 University of Massachusetts Amherst, Center for Data Science
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 444af5e..b7e0420 100644
--- a/README.md
+++ b/README.md
@@ -20,40 +20,8 @@ Navigate to your project directory in the terminal and run the following command
 To activate the virtual environment, use `venv\Scripts\activate` on Windows or `source venv/bin/activate` on Unix or MacOS. To install the package, run `pip install .` for regular use or `pip install -e .[test,dev]` for development. Note: on zsh, use `pip install -e '.[test,dev]'` (the extras must be quoted).
- -For example, if you use Conda, you would run the following to create an environment named `template` with python version 3.10, then activate it and install the package in developer mode: -``` -$ conda create -n template python=3.10 -y -Collecting package metadata (current_repodata.json): done -Solving environment: done - -## Package Plan ## - - environment location: /home/virginia/miniconda3/envs/template - - added / updated specs: - - python=3.10 - - - -The following NEW packages will be INSTALLED: - - package | build - ---------------------------|----------------- -... -$ conda activate `template` -$ pip install -e .[test,dev] -Obtaining file:///home/virginia/workspace/PythonProjectTemplate - Installing build dependencies ... done - Getting requirements to build wheel ... done - Installing backend dependencies ... done - Preparing wheel metadata ... done -Collecting numpy -... -``` - -## Instructions for Downloading data from azure +## Instructions for Downloading data from Azure This part demonstrates how to interact with Azure services to download blobs from Azure Blob Storage. First, launch Jupyter Notebook: @@ -98,187 +66,10 @@ list_blobs_in_container(storage_account_name, storage_account_key, container_nam download_blobs_in_container(storage_account_name, storage_account_key, container_name) ``` - - - - -## Specifying Requirements -In order for users to install your package and all the libraries it depends on by running `pip install`, you need to provide a `pyproject.toml` file. This has two important sections: -- `project`: List project metadata and version information and all library requirements/dependencies, including for testing or development environments. This is the main file you will work with and add requirements to. Some dependencies -- `build-system`: Define the build tool that is used to package and distribute your code. For this project, we use [SetupTools](https://setuptools.pypa.io/en/latest/userguide/quickstart.html). - -If you'd like to learn more about python packaging, refer to [the Python Packaging User Guide](https://packaging.python.org/en/latest/) or [PEP 517](https://peps.python.org/pep-0517/#build-requirements). - -### Requirements via conda environment files -[Anaconda](https://www.anaconda.com/download/) and its bare bones counterpart, [Miniconda](https://docs.anaconda.com/free/miniconda/index.html), are especially useful if your project depends on libraries that are difficult to install in the standard pythonic way, such as [GPU libraries](https://docs.anaconda.com/free/working-with-conda/packages/gpu-packages/). If this is the case, you should also share a [Conda environment file](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-file-manually) with your code. The conda virtual environment will need to be created and activated before any `pip install` steps. Installations with conda dependencies are usually a little more complicated, so make sure you include step-by-step instructions in documentation. - -### Containerized applications -In cases when its important that your software work exactly the same on every operating system or you want to abstract away difficult installation steps for end user, you can consider creating a [Docker container](https://www.docker.com/resources/what-container/). This is often appropriate deploying services in the cloud or providing an application for a tech-savvy person to use on their own. However, it's not necessary for most of our projects. 
- - -## Directory Structure -So what does each file in this repository do? -``` -. -├── src - ├── cdstemplate # The python package root - Any code you'd like to be able to import lives here -   ├── corpus_counter_script.py # A script that takes a list of documents as input and outputs a CSV of word counts -   ├── __init__.py # Indicates that this directory is a python package, you can put special import instructions here -    ├── word_count.py # A module that has functions and classes to import -   └── utils.py # A module that handles logging and other internals -├── CHANGELOG.md # Versioning information -├── dag_workflow.png # An image that is linked to in this README -├── data # Data files which may or may not be tracked in Git, but we reserve a folder for them so that users can all have the same relative paths -   ├── gutenberg # Sample text input files, the raw inputs to our experiment pipeline. -   └── gutenberg_counts.csv # The expected output file for our experiment. It's generated by `dvc repro` and is ignored by git. -├── docs # Sphinx auto-documentation uses this folder to run its scripts and store documentation -   ├── _build # Contains the Sphinx doctree and html documentation source code - ├── doctrees # A folder with doctree construction information -   └── html # A folder that contains the html code for all automatically created documentation -   ├── _static # A folder that can contain static code -   ├── _templates # A folder that can contain Sphinx templates - ├── conf.py # A function that configures Sphinx according to user specifications - ├── index.rst # A directory that users can input new functions into for auto-documentation - ├── make.bat # A function that runs auto-documentation - └── Makefile # A function that creates html documentation based on functions in the index.rst file -├── dvc.lock # Data Version Control uses this file to compare experiment versions. It's tracked in Git, but don't edit it manually. -├── dvc.yaml # Create the Data Version Control pipeline stages here -├── notebooks -   └── word_count_prototype.ipynb # A jupyter notebook that makes pretty plots -├── pyproject.toml # Project metadata, dependencies and build tools are declared for proper installation and packaging. -├── README.md # You're reading it now! -└── tests - └── test_word_count.py # Unit and smoke tests for the word_count module -├── .dvc # The configuration file for Data Version Control -├── .github - └── workflows/python_package.yml # Github Workflow file, configures running tests on Github every time a pull request to the main branch is made -├── .gitignore # Lists files that should not be included in version control, created from Github's template .gitignore for Python. -└── .dvcignore # Lists files that Data Version Control should skip when checking for changes in stage dependencies. -``` - - -# Communication Tools and Code -When you work with others, it's not just about the code! - -The README, CHANGELOG and docstrings are just as important. - -- _README.md_ : Summarize the project's purpose and give installation instructions. -- _CHANGELOG.md_ : Tell the user what has changed between versions and why, see [Keep A CHANGELOG](https://keepachangelog.com/en/1.0.0/) -- docstrings: Appear directly in your code and give an overview of each function or object. They can be printed using `help(object)` from the python interpreter or used to automatically generate API documentation with a tool like [Sphinx](https://www.sphinx-doc.org/en/master/index.html). 
There are many different docstring formats. Your team can choose any they like, just be consistent. This template uses [reStructuredText style](https://peps.python.org/pep-0287/).
-- Sphinx : Create html documentation for your functions based on the docstrings you write in the code. Use [Sphinx](https://www.sphinx-doc.org/en/master/index.html) to streamline the documentation process.
-
-Read [Real Python's Documenting Python Code: A Complete Guide](https://realpython.com/documenting-python-code/) for more ideas about effectively documenting code. The `.md` files are written using [Markdown](https://www.markdownguide.org/), a handy formatting language that is automatically rendered in Github.
+## Uploading data to a DHIS2 server
+This repository assumes you will eventually want to upload data extracted from form images to a [DHIS2 health information server](https://dhis2.org/). To configure the connection to the DHIS2 server, fill in the server URL, your username, and your password in the `settings.ini` file.
 
 # Tests
-Although it's [impossible to generally prove that your code is bug-free](https://en.wikipedia.org/wiki/Undecidable_problem), automated testing is a valuable tool. It provides:
-- Proof that your code works as intended in most common examples and important edge cases
-- Instant feedback on whether changes to the code broke its functionality
-- Examples of how to use the code, a type of documentation
-
-This repository has tests configured using [pytest](https://pytest.org/) and the Github action defined in `.github/workflows/python_package.yml` will run tests every time you make a pull request to the main branch of the repository. [Unittest](https://docs.python.org/3/library/unittest.html#module-unittest) and [nose2](https://docs.nose2.io/en/latest/) are other common test frameworks for python.
-
-You can run tests locally using `pytest` or `python -m pytest` from the command line from the root of the repository or configure them to be [run with a debugger in your IDE](https://code.visualstudio.com/docs/python/testing). For example:
-```
-$ pytest
-======================== test session starts ========================
-platform linux -- Python 3.10.4, pytest-7.1.2, pluggy-1.0.0
-rootdir: /home/virginia/workspace/PythonProjectTemplate
-collected 2 items
-
-tests/test_sample_module.py .
-```
-
-Read the following articles for tips on writing your own tests:
-- [Getting Started With Testing in Python](https://realpython.com/python-testing/)
-- [13 Tips for Writing Useful Unit Tests](https://betterprogramming.pub/13-tips-for-writing-useful-unit-tests-ca20706b5368)
-- [Why Good Developers Write Bad Unit Tests](https://mtlynch.io/good-developers-bad-tests)
-
-# Reproducible Experiments
-In practice, data science often relies on pipelining many operations together to prepare data, extract features, then train and evaluate models or produce analysis. Whether someone can reproduce your experiments depends on how clearly you lay out the pipeline and parameters that you use for each 'node' in the pipeline, including stating where to find the input data and how it should be formatted.
-
-In practice, you should write scripts that are flexible enough to change the parameters you'd like to experiment with and define the pipeline using a directed acyclic graph (DAG), where the outputs from earlier steps become the dependencies for later ones.
It's good practice to draw out the DAG for your experiment first, noting inputs, outputs and parameters, before you code scripts for the pipeline, like this: - -![DAG diagram](./dag_workflow.png) - -## Reusable Scripts -Our 'experiment' here is simply counting the occurrence of words from a set of documents, in the form of text files, then writing the counts of each word to a CSV file. This operation is made available to users via the `cdstemplate.corpus_counter_script` and by using the [`argparse` command-line parsing library](https://docs.python.org/3/library/argparse.html#module-argparse), we clearly describe the expected input parameters and options, which can be displayed using the `--help` flag. There are [other command-line parsers](https://realpython.com/comparing-python-command-line-parsing-libraries-argparse-docopt-click/) you can use, but `argparse` comes with python, so you don't need to add an extra requirement. - - -Since we have made the package installable and defined it as the `corpus-counter` script in `project.toml`, users can run it using `corpus-counter`, `python -m cdstemplate.corpus_counter_script` or `python src/cdstemplate/corpus_counter_script.py`, but all work the same way: -``` -$ corpus-counter --help -usage: corpus-counter [-h] [--case-insensitive] csv documents [documents ...] - -A script to generate counts of tokens in a corpus - -positional arguments: - csv Path to the output CSV storing token counts. Required. - documents Paths to at least one raw text document that make up the corpus. Required. - -options: - -h, --help show this help message and exit - --case-insensitive, -c - Default is to have case sensitive tokenization. Use this flag to make the token counting - case insensitive. Optional. -$ python src/cdstemplate/corpus_counter_script.py --help -usage: corpus_counter_script.py [-h] [--case-insensitive] -... -$ python -m cdstemplate.corpus_counter_script --help -usage: corpus_counter_script.py [-h] [--case-insensitive] - csv documents [documents ...] - -A script to generate counts of tokens in a corpus -... -``` - -Using the help message, we can understand how to run the script to count all the words in the text files in `data/gutenberg` in a case-insensitive way, saving word counts to a new csv file, `data/gutenberg_counts.csv`: -``` -$ corpus-counter data/gutenberg_counts.csv data/gutenberg/*.txt --case-insensitive -INFO : 2023-12-08 12:26:10,770 : cdstemplate.corpus_counter_script : Command line arguments: Namespace(csv='data/gutenberg_counts.csv', documents=['data/gutenberg/austen-emma.txt', 'data/gutenberg/austen-persuasion.txt', 'data/gutenberg/austen-sense.txt', 'data/gutenberg/bible-kjv.txt', 'data/gutenberg/blake-poems.txt', 'data/gutenberg/bryant-stories.txt', 'data/gutenberg/burgess-busterbrown.txt', 'data/gutenberg/carroll-alice.txt', 'data/gutenberg/chesterton-ball.txt', 'data/gutenberg/chesterton-brown.txt', 'data/gutenberg/chesterton-thursday.txt'], case_insensitive=True) -DEBUG : 2023-12-08 12:26:10,771 : cdstemplate.word_count : CorpusCounter instantiated, tokenization pattern: \s, case insensitive: True -INFO : 2023-12-08 12:26:10,771 : cdstemplate.corpus_counter_script : Tokenizing document number 0: data/gutenberg/austen-emma.txt -DEBUG : 2023-12-08 12:26:10,771 : cdstemplate.word_count : Tokenizing '[Emma by Jane Austen 1816] -... 
-``` - -## Data Dependencies Tools -[Build automation tools](https://en.wikipedia.org/wiki/Build_automation) like [Make](https://en.wikipedia.org/wiki/Make_(software)) have been used to resolve dependencies and compile software since the 1970s. Build automation can also be used in data science and machine learning workflows for [many of the same reasons](https://en.wikipedia.org/wiki/Build_automation#Advantages), like eliminating redundant tasks, maintaining history and improved quality and consistency through automating processes. Using a build tool can also be a documentation and communication tool, since it declares the most common ways to run code and reproduce experiments. - -In the Machine Learning Operations (MLOps) community these automation tools are often called [task or workflow orchestration](https://www.datarevenue.com/en-blog/airflow-vs-luigi-vs-argo-vs-mlflow-vs-kubeflow). There are many options, such as [Airflow](https://airflow.apache.org/), [Luigi](https://github.com/spotify/luigi), [MLflow](https://mlflow.org/), [Kubeflow](https://www.kubeflow.org/) and [iterative.ai's DVC and CML](https://iterative.ai/), all with various additional features for versioning experiments, scheduling and visualizations, but at the core they are all built on the same dependency graph principle as the OG [Make](https://opensource.com/article/18/8/what-how-makefile). - -Some of these tools can take a lot of work to set up, so discuss the trade-offs with your team to decide what you'd like to use. In the early stages of a project, we recommend using something easy to set up, like [DVC](https://dvc.org/) or [Make](https://opensource.com/article/18/8/what-how-makefile). - -### DVC Example -In this repository, we have set up a pipeline using [DVC](https://dvc.org/), which has the added benefit of versioning data and experiments. DVC is especially easy to set up for Python projects, because it can be installed via pip in the project requirements and integrates with git. See [DVC Get Started documentation](https://dvc.org/doc/start) for instructions on setting up DVC in your own repository. - -The stages in our word count experiment pipeline are configured in `dvc.yaml`. As described in the previous section, this takes the `data/gutenberg` files as input and produces `data/gutenberg_counts.csv` as the final product. Since `data/gutenberg_counts.csv` should be generated whenever the data or scripts change, it is managed by DVC and ignored by git. You can re-run the pipeline steps by running `dvc repro`. -``` -$ dvc repro -Running stage 'count-words': -> python cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv data/gutenberg/*.txt --case-insensitive -INFO : 2022-05-23 11:18:42,813 : __main__ : Command line arguments: Namespace(csv='data/gutenberg_counts.csv', documents=['data/gutenberg/austen-emma.txt', 'data/gutenberg/austen-persuasion.txt', 'data/gutenberg/austen-sense.txt', 'data/gutenberg/bible-kjv.txt', 'data/gutenberg/blake-poems.txt', 'data/gutenberg/bryant-stories.txt', 'data/gutenberg/burgess-busterbrown.txt', 'data/gutenberg/carroll-alice.txt', 'data/gutenberg/chesterton-ball.txt', 'data/gutenberg/chesterton-brown.txt', 'data/gutenberg/chesterton-thursday.txt'], case_insensitive=True) -... -$ dvc repro -Stage 'count-words' didn't change, skipping -Data and pipelines are up to date. 
-``` - - -You can see the stages in the DAG by running `dvc dag`, in our case it's just a single step called `count-words`: -``` -$ dvc dag -+-------------+ -| count-words | -+-------------+ -``` - -## A Note on Notebooks -We have also included an example Jupyter notebook - -Jupyter notebooks are useful tools for exploratory data analysis, prototyping baseline models and creating visualizations. However, they are _not_ an acceptable way to hand-off code for others to reproduce. Have you ever tried to run someone else's notebook, only to find out a cell was deleted, and you have no idea what it was supposed to do? - -[Don't put data science notebooks into production](https://martinfowler.com/articles/productize-data-sci-notebooks.html), they are [hard to test, version, parametrize and keep track of state](https://www.reddit.com/r/datascience/comments/ezh50g/jupyter_notebooks_in_productionno_just_no/). - -There _are_ [companies that use notebooks in production architecture](https://blog.goodaudience.com/inside-netflixs-notebook-driven-architecture-aedded32145e), but they have entire Devops organizations to help configure deployment and _still_ use workflow tools like [papermill](https://papermill.readthedocs.io/en/latest/) and Airflow to parametrize notebook dependencies. Unless you are willing to put in the effort to parametrize your notebooks in pipeline workflows, don't use them when stability and reproducibility matter. +This repository has unit tests in the `tests` directory configured using [pytest](https://pytest.org/) and the Github action defined in `.github/workflows/python_package.yml` will run tests every time you make a pull request to the main branch of the repository. -Best practices for working with notebooks are changing as they become more popular. However, for now most of these services are too expensive for our partners or difficult to configure. You can use a notebook for prototyping and exploratory analysis, but once the project moves forward, use [`nbconvert`](https://linuxhint.com/convert-jupyter-notebook-python/) to convert the notebook to python code, then add some tests! +You can run tests locally using `pytest` or `python -m pytest` from the command line from the root of the repository or configure them to be [run with a debugger in your IDE](https://code.visualstudio.com/docs/python/testing). 
diff --git a/data/.gitignore b/data/.gitignore index ff8bf23..837ab75 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -1 +1 @@ -/gutenberg_counts.csv +MSF_data \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index b002fd6..291ea91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,10 @@ dependencies = [ [project.optional-dependencies] # Extra dependencies only needed for running tests go here -test = ["pytest"] +test = [ + "pytest", + "requests_mock", + ] # Dependencies that are useful only to developers, like an autoformatter and support for visualizations in jupyter notebooks go here dev = [ diff --git a/settings.ini b/settings.ini new file mode 100644 index 0000000..a8b9962 --- /dev/null +++ b/settings.ini @@ -0,0 +1,4 @@ +[DHIS2Server] +username= +password= +server_url= \ No newline at end of file diff --git a/src/msfocr/data/__init__.py b/src/msfocr/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/msfocr/data/data_upload_DHIS2.py b/src/msfocr/data/data_upload_DHIS2.py index abd23bb..92808f6 100644 --- a/src/msfocr/data/data_upload_DHIS2.py +++ b/src/msfocr/data/data_upload_DHIS2.py @@ -1,8 +1,21 @@ +import configparser + import requests -dhis2_username = 'anju-santosh-kumar' -dhis2_password = 'AnjSOCR!01' -DHIS2_Test_Server_URL = 'https://ocr.twoca.org/' +# Set these before trying to make requests +DHIS2_USERNAME = None +DHIS2_PASSWORD = None +DHIS2_SERVER_URL = None + +# TODO It might be clearer to create a Server object class and have this be the __init__() function +def configure_DHIS2_server(config_path = "settings.ini"): + config = configparser.ConfigParser() + config.read(config_path) + dhis2_section = config["DHIS2Server"] + global DHIS2_SERVER_URL, DHIS2_USERNAME, DHIS2_PASSWORD + DHIS2_USERNAME = dhis2_section["username"] + DHIS2_PASSWORD = dhis2_section["password"] + DHIS2_SERVER_URL = dhis2_section["server_url"] # Command to get all fields that are organisationUnits @@ -14,14 +27,12 @@ def getUID(item_type, search_items): filter_param = 'filter=' + '&filter='.join([f'name:ilike:{term}' for term in search_items]) - url = f'{DHIS2_Test_Server_URL}/api/{item_type}?{filter_param}' - - response = requests.get(url, auth=(dhis2_username, dhis2_password)) + url = f'{DHIS2_SERVER_URL}/api/{item_type}?{filter_param}' + response = requests.get(url, auth=(DHIS2_USERNAME, DHIS2_PASSWORD)) if response.status_code == 401: raise ValueError("Authentication failed. 
Check your username and password.") data = response.json() - items = data[item_type] print(f"{len(data[item_type])} matches found for {search_items}") print(items) diff --git a/src/msfocr/docTR/ocr_functions.py b/src/msfocr/docTR/ocr_functions.py index beebaa7..81c494e 100644 --- a/src/msfocr/docTR/ocr_functions.py +++ b/src/msfocr/docTR/ocr_functions.py @@ -176,9 +176,11 @@ def generate_key_value_pairs(table): print(data_element, data_element_id) category_id = data_upload_DHIS2.getUID('categoryOptions', [category]) # Append to the list of data elements to be push to DHIS2 - data_element_pairs.append({ 'dataElement': data_element_id, + data_element_pairs.append( + {'dataElement': data_element_id, 'categoryOptions': category_id, - 'value': cell_value}) + 'value': cell_value} + ) return data_element_pairs diff --git a/tests/conftest.py b/tests/conftest.py index 20bbfb1..1f8a7b1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,31 @@ -import pytest +import configparser from pathlib import Path +import pytest + +from msfocr.data.data_upload_DHIS2 import configure_DHIS2_server + @pytest.fixture -def datadir(request): +def datadir(): # Path to the directory containing test data test_data_dir = Path(__file__).parent / 'data' return test_data_dir + + +@pytest.fixture +def test_server_config(tmp_path): + """Configure a mock DHIS2 server to mimic requests. + You will still need to use requests_mock to imitate responses from http://test.com. + """ + config = configparser.ConfigParser() + config["DHIS2Server"] = {"username": "tester", + "password": "testing_password", + "server_url": "http://test.com" + } + configpath = tmp_path / "test_settings.ini" + with configpath.open("w") as configfile: + config.write(configfile) + + configure_DHIS2_server(configpath) + + diff --git a/tests/test_ocr_functions.py b/tests/test_ocr_functions.py index b7712eb..311e06b 100644 --- a/tests/test_ocr_functions.py +++ b/tests/test_ocr_functions.py @@ -1,10 +1,11 @@ -from msfocr.docTR import ocr_functions from doctr.io import DocumentFile from doctr.models import ocr_predictor from img2table.document import Image from img2table.ocr import DocTR import pandas as pd +from msfocr.docTR import ocr_functions + def test_get_sheet_type(datadir): """ Tests if the tally sheet type (dataSet, orgUnit, period) detected for a sample image is correct. @@ -21,7 +22,7 @@ def test_get_sheet_type(datadir): assert sheet_type[2] == ["2024-06-25", "2024-06-30"] -def test_generate_key_value_pairs(): +def test_generate_key_value_pairs(test_server_config, requests_mock): """ Tests if the dataElement value in the key-value pairs is correct by providing sample tablular data. 
""" @@ -41,6 +42,11 @@ def test_generate_key_value_pairs(): '5-14y': [None, None, None] }) + requests_mock.get("http://test.com/api/dataElements?filter=name:ilike:BCG", json={"dataElements":[{"id": 1, "displayName": "AVAC_002 BCG"}]}) + requests_mock.get("http://test.com/api/categoryOptions?filter=name:ilike:0-11m", json={'categoryOptions': [{'id': 2, 'displayName': '0-11m'}]}) + requests_mock.get("http://test.com/api/dataElements?filter=name:ilike:Polio (OPV) 1 (from 6 wks)", json={'dataElements': [{'id': 3, 'displayName': 'AVAC_006 Polio (OPV) 1 (from 6 wks)'}]}) + requests_mock.get("http://test.com/api/categoryOptions?filter=name:ilike:12-59m", json={'categoryOptions': [{'id': 'tWRttYIzvBn', 'displayName': '12-59m'}]}) + answer = [{'dataElement': '', 'categoryOptions': '', 'value': '45+29'}, {'dataElement': '', 'categoryOptions': '', 'value': '30+18'}, {'dataElement': '', 'categoryOptions': '', 'value': '55+29'}]