Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

process validation: allow tasks without outfilegrp #296

Merged
merged 12 commits into from
Nov 15, 2019
1 change: 1 addition & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ ignored-modules=cv2,tesserocr,ocrd.model
ignore-patterns='.*generateds.*'
disable =
fixme,
trailing-whitespace,
logging-not-lazy,
inconsistent-return-statements,
invalid-name,
Expand Down
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

Changed:

* `ocrd process` uses the ocrd-tool.json of the tools to check whether output file group necessary, #296

## [2.0.0] - 2019-11-05

Changed:
Expand All @@ -23,6 +27,9 @@ Fixed:
* PAGE XML output references xsi:schemaLocation, #331
* Update Pillow to 6.2.0

Changed:
* `ocrd process`: task validation takes processor's ocrd-tool into account, #296

## [1.0.0] - 2019-10-18

* Workspace validation: Validate that files mentioned in pc:Page/@imageFilename exist in METS and on FS, #309
Expand Down
2 changes: 0 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ FROM ubuntu:19.10
MAINTAINER OCR-D
ENV DEBIAN_FRONTEND noninteractive
ENV PYTHONIOENCODING utf8
ENV LC_ALL C.UTF-8
ENV LANG C.UTF-8

WORKDIR /build-ocrd
COPY ocrd ./ocrd
Expand Down
21 changes: 3 additions & 18 deletions ocrd/ocrd/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
is_local_filename,
get_local_filename,
setOverrideLogLevel,
parse_json_string_or_file,

VERSION as OCRD_VERSION
)

Expand All @@ -23,27 +25,10 @@ def _set_root_logger_version(ctx, param, value): # pylint: disable=unused-arg
type=click.Choice(['OFF', 'ERROR', 'WARN', 'INFO', 'DEBUG', 'TRACE']),
default=None, callback=_set_root_logger_version)

def _parse_json_string_or_file(ctx, param, value='{}'): # pylint: disable=unused-argument
ret = None
err = None
try:
try:
with open(value, 'r') as f:
ret = json.load(f)
except FileNotFoundError:
ret = json.loads(value.strip())
if not isinstance(ret, dict):
err = ValueError("Not a valid JSON object: '%s' (parsed as '%s')" % (value, ret))
except json.decoder.JSONDecodeError as e:
err = ValueError("Error parsing '%s': %s" % (value, e))
if err:
raise err # pylint: disable=raising-bad-type
return ret

parameter_option = click.option('-p', '--parameter',
help="Parameters, either JSON string or path to JSON file",
default='{}',
callback=_parse_json_string_or_file)
callback=lambda ctx, param, value: parse_json_string_or_file(value))

def ocrd_cli_wrap_processor(processorClass, ocrd_tool=None, mets=None, working_dir=None, dump_json=False, version=False, **kwargs):
LOG = getLogger('ocrd_cli_wrap_processor')
Expand Down
21 changes: 13 additions & 8 deletions ocrd/ocrd/task_sequence.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import os
import json
from shlex import split as shlex_split
from distutils.spawn import find_executable as which # pylint: disable=import-error,no-name-in-module
from subprocess import run, PIPE

from ocrd_utils import getLogger
from ocrd_utils import getLogger, parse_json_string_or_file
from ocrd.processor.base import run_cli
from ocrd.resolver import Resolver

Expand Down Expand Up @@ -38,14 +39,18 @@ def __init__(self, executable, input_file_grps, output_file_grps, parameter_path
self.parameter_path = parameter_path

def validate(self):
if self.parameter_path and not os.access(self.parameter_path, os.R_OK):
raise Exception("Parameter file not readable: %s" % self.parameter_path)
if not self.input_file_grps:
raise Exception("Task must have input file group")
if not self.output_file_grps:
raise Exception("Task must have output file group")
if not which(self.executable):
raise Exception("Executable not found in PATH: %s" % self.executable)
result = run([self.executable, '--dump-json'], stdout=PIPE, check=True, universal_newlines=True)
ocrd_tool_json = json.loads(result.stdout)
# TODO check for required parameters in ocrd_tool
if self.parameter_path:
parse_json_string_or_file(self.parameter_path)
if not self.input_file_grps:
raise Exception("Task must have input file group")
if 'output_file_grp' in ocrd_tool_json and not self.output_file_grps:
raise Exception("Processor requires output_file_grp but none was provided.")
return True

def __str__(self):
ret = '%s -I %s -O %s' % (
Expand Down
Binary file removed ocrd_utils/ocrd_utils/.doctrees/environment.pickle
Binary file not shown.
30 changes: 25 additions & 5 deletions ocrd_utils/ocrd_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@

FS-related utilities

* ``is_string``, ``membername``, ``concat_padded``, ``nth_url_segment``, ``remove_non_path_from_url``
* ``is_string``, ``membername``, ``concat_padded``, ``nth_url_segment``, ``remove_non_path_from_url``, ``parse_json_string_or_file``

String and OOP utilities

Expand Down Expand Up @@ -85,6 +85,7 @@
'logging',
'membername',
'image_from_polygon',
'parse_json_string_or_file',
'points_from_bbox',
'points_from_polygon',
'points_from_x0y0x1y1',
Expand Down Expand Up @@ -114,12 +115,12 @@
]

import io
import re
import json
import sys
import logging
import os
from os import getcwd, chdir
from os.path import isfile, abspath as os_abspath
import re
from zipfile import ZipFile
import contextlib

Expand Down Expand Up @@ -243,7 +244,7 @@ def coordinates_of_segment(segment, parent_image, parent_coords):
def pushd_popd(newcwd=None):
try:
oldcwd = getcwd()
except FileNotFoundError as e:
except FileNotFoundError as e: # pylint: disable=unused-variable
# This happens when a directory is deleted before the context is exited
oldcwd = '/tmp'
try:
Expand Down Expand Up @@ -719,7 +720,6 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])):

def transform_coordinates(polygon, transform=None):
"""Apply an affine transformation to a set of points.

Augment the 2d numpy array of points ``polygon`` with a an extra
column of ones (homogeneous coordinates), then multiply with
the transformation matrix ``transform`` (or the identity matrix),
Expand Down Expand Up @@ -765,3 +765,23 @@ def xywh_from_points(points):
Construct a numeric dict representing a bounding box from polygon coordinates in page representation.
"""
return xywh_from_bbox(*bbox_from_points(points))

def parse_json_string_or_file(value='{}'): # pylint: disable=unused-argument
"""
Parse a string as either the path to a JSON object or a literal JSON object.
"""
ret = None
err = None
try:
try:
with open(value, 'r') as f:
ret = json.load(f)
except FileNotFoundError:
ret = json.loads(value.strip())
if not isinstance(ret, dict):
err = ValueError("Not a valid JSON object: '%s' (parsed as '%s')" % (value, ret))
except json.decoder.JSONDecodeError as e:
err = ValueError("Error parsing '%s': %s" % (value, e))
if err:
raise err # pylint: disable=raising-bad-type
return ret
24 changes: 0 additions & 24 deletions tests/test_decorators.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from tempfile import TemporaryDirectory
from pathlib import Path

import click
from click.testing import CliRunner
Expand All @@ -11,7 +9,6 @@
ocrd_cli_options,
ocrd_loglevel,
ocrd_cli_wrap_processor,
_parse_json_string_or_file
) # pylint: disable=protected-access
from ocrd_utils.logging import setOverrideLogLevel, initLogging
from ocrd_utils import pushd_popd
Expand Down Expand Up @@ -89,27 +86,6 @@ def test_processor_run(self):
result = self.runner.invoke(cli_dummy_processor, ['--mets', 'mets.xml'])
self.assertEqual(result.exit_code, 0)

def test_parameters0(self):
self.assertEqual(_parse_json_string_or_file(None, None), {})
self.assertEqual(_parse_json_string_or_file(None, None, '{}'), {})
self.assertEqual(_parse_json_string_or_file(None, None, '{"foo": 32}'), {'foo': 32})
self.assertEqual(_parse_json_string_or_file(None, None, '{"foo": 32}'), {'foo': 32})

def test_parameter_file(self):
with TemporaryDirectory() as tempdir:
paramfile = str(Path(tempdir, '{}')) # XXX yes, the file is called '{}'
with open(paramfile, 'w') as f:
f.write('{"bar": 42}')
self.assertEqual(_parse_json_string_or_file(None, None, paramfile), {'bar': 42})
with pushd_popd(tempdir):
self.assertEqual(_parse_json_string_or_file(None, None), {'bar': 42})

def test_parameters_invalid(self):
with self.assertRaisesRegex(ValueError, 'Not a valid JSON object'):
_parse_json_string_or_file(None, None, '[]')
with self.assertRaisesRegex(ValueError, 'Error parsing'):
_parse_json_string_or_file(None, None, '[}')


if __name__ == '__main__':
main()
76 changes: 64 additions & 12 deletions tests/test_task_sequence.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,100 @@
# import os
# from tempfile import TemporaryDirectory
import os
import json
from tempfile import mkdtemp
from shutil import rmtree

from pathlib import Path

from tests.base import TestCase, main

from ocrd.task_sequence import ProcessorTask

SAMPLE_NAME = 'ocrd-sample-processor'
SAMPLE_OCRD_TOOL_JSON = '''{
"executable": "ocrd-sample-processor",
"description": "Do stuff and things",
"categories": ["Image foobaring"],
"steps": ["preprocessing/optimization/foobarization"],
"input_file_grp": ["OCR-D-IMG"],
"output_file_grp": ["OCR-D-IMG-BIN"],
"parameters": {
"param1": {
"type": "boolean",
"default": false,
"description": "param1 description"
}
}
}'''
SAMPLE_NAME_WITHOUT_OUTPUT_FILE_GRP = 'ocrd-sample-processor-without-file-grp'
SAMPLE_OCRD_TOOL_JSON_WITHOUT_OUTPUT_FILE_GRP = json.loads(SAMPLE_OCRD_TOOL_JSON)
del SAMPLE_OCRD_TOOL_JSON_WITHOUT_OUTPUT_FILE_GRP['output_file_grp']
SAMPLE_OCRD_TOOL_JSON_WITHOUT_OUTPUT_FILE_GRP = json.dumps(SAMPLE_OCRD_TOOL_JSON_WITHOUT_OUTPUT_FILE_GRP)

class TestTaskSequence(TestCase):

def tearDown(self):
rmtree(self.tempdir)

def setUp(self):
self.tempdir = mkdtemp(prefix='ocrd-task-sequence-')

p = Path(self.tempdir, SAMPLE_NAME)
p.write_text("""\
#!/usr/bin/env python
print('''%s''')
""" % SAMPLE_OCRD_TOOL_JSON)
p.chmod(0o777)

p = Path(self.tempdir, SAMPLE_NAME_WITHOUT_OUTPUT_FILE_GRP)
p.write_text("""\
#!/usr/bin/env python
print('''%s''')
""" % SAMPLE_OCRD_TOOL_JSON_WITHOUT_OUTPUT_FILE_GRP)
p.chmod(0o777)

os.environ['PATH'] = os.pathsep.join([self.tempdir, os.environ['PATH']])
# from distutils.spawn import find_executable as which # pylint: disable=import-error,no-name-in-module
# self.assertTrue(which('ocrd-sample-processor'))

def test_parse_no_in(self):
task = ProcessorTask.parse('sample-processor1')
task = ProcessorTask.parse('sample-processor')
with self.assertRaisesRegex(Exception, 'must have input file group'):
task.validate()

def test_parse_no_out(self):
task = ProcessorTask.parse('sample-processor1 -I IN')
with self.assertRaisesRegex(Exception, 'must have output file group'):
task = ProcessorTask.parse('sample-processor -I IN')
with self.assertRaisesRegex(Exception, 'Processor requires output_file_grp but none was provided.'):
task.validate()
# this should validate
task2 = ProcessorTask.parse('sample-processor-without-file-grp -I IN')
self.assertTrue(task2.validate())

def test_parse_unknown(self):
with self.assertRaisesRegex(Exception, 'Failed parsing task description'):
ProcessorTask.parse('sample-processor1 -x wrong wrong wrong')
ProcessorTask.parse('sample-processor -x wrong wrong wrong')

def test_parse_ok(self):
task_str = 'sample-processor1 -I IN -O OUT -p /path/to/param.json'
task_str = 'sample-processor -I IN -O OUT -p /path/to/param.json'
task = ProcessorTask.parse(task_str)
self.assertEqual(task.executable, 'ocrd-sample-processor1')
self.assertEqual(task.executable, 'ocrd-sample-processor')
self.assertEqual(task.input_file_grps, ['IN'])
self.assertEqual(task.output_file_grps, ['OUT'])
self.assertEqual(task.parameter_path, '/path/to/param.json')
self.assertEqual(str(task), task_str)

def test_parse_parameter_none(self):
task_str = 'sample-processor1 -I IN -O OUT1,OUT2'
task_str = 'sample-processor -I IN -O OUT1,OUT2'
task = ProcessorTask.parse(task_str)
self.assertEqual(task.parameter_path, None)
self.assertEqual(str(task), task_str)

def test_fail_validate_param(self):
task = ProcessorTask.parse('sample-processor1 -I IN -O OUT -p /path/to/param.json')
with self.assertRaisesRegex(Exception, 'Parameter file not readable'):
task = ProcessorTask.parse('sample-processor -I IN -O OUT -p /path/to/param.json')
with self.assertRaisesRegex(Exception, 'Error parsing'):
task.validate()

def test_fail_validate_executable(self):
task = ProcessorTask.parse('sample-processor1 -I IN -O OUT -p /tmp')
task = ProcessorTask.parse('no-such-processor -I IN')
with self.assertRaisesRegex(Exception, 'Executable not found in '):
task.validate()

Expand Down
28 changes: 28 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from os import getcwd
from tempfile import TemporaryDirectory
from pathlib import Path

from PIL import Image

Expand All @@ -18,6 +20,8 @@
nth_url_segment,
remove_non_path_from_url,

parse_json_string_or_file,

points_from_bbox,
points_from_x0y0x1y1,
points_from_xywh,
Expand Down Expand Up @@ -171,5 +175,29 @@ def test_nth_url_segment(self):
self.assertEqual(nth_url_segment('/path/to/foo#frag', n=-2), 'to')
self.assertEqual(nth_url_segment('https://server/foo?xyz=zyx'), 'foo')

def test_parse_json_string_or_file(self):
self.assertEqual(parse_json_string_or_file(), {})
self.assertEqual(parse_json_string_or_file('{}'), {})
self.assertEqual(parse_json_string_or_file('{"foo": 32}'), {'foo': 32})

def test_parameter_file(self):
"""
Verify that existing filenames get priority over valid JSON string interpretation
"""
with TemporaryDirectory() as tempdir:
paramfile = Path(tempdir, '{"foo":23}') # XXX yes, the file is called '{"foo":23}'
paramfile.write_text('{"bar": 42}')
# /tmp/<var>/{"foo":23} -- exists, read file and parse as JSON
self.assertEqual(parse_json_string_or_file(str(paramfile)), {'bar': 42})
# $PWD/{"foo":23} -- does not exist, parse as json
self.assertEqual(parse_json_string_or_file(paramfile.name), {'foo': 23})

def test_parameters_invalid(self):
with self.assertRaisesRegex(ValueError, 'Not a valid JSON object'):
parse_json_string_or_file('[]')
with self.assertRaisesRegex(ValueError, 'Error parsing'):
parse_json_string_or_file('[}')


if __name__ == '__main__':
main()