From c79b30699ea32b924d89cb8b9a86ea9e946f4a38 Mon Sep 17 00:00:00 2001 From: Dhwani Patel Date: Fri, 3 Jan 2025 05:51:36 -0600 Subject: [PATCH 1/7] Add PDF validation command of Jhove --- .../0045_add_jhove_validation_for_pdf.py | 155 ++++++++++++++++++ tests/dashboard/fpr/test_views.py | 4 +- 2 files changed, 157 insertions(+), 2 deletions(-) create mode 100644 src/dashboard/src/fpr/migrations/0045_add_jhove_validation_for_pdf.py diff --git a/src/dashboard/src/fpr/migrations/0045_add_jhove_validation_for_pdf.py b/src/dashboard/src/fpr/migrations/0045_add_jhove_validation_for_pdf.py new file mode 100644 index 0000000000..49dcac9082 --- /dev/null +++ b/src/dashboard/src/fpr/migrations/0045_add_jhove_validation_for_pdf.py @@ -0,0 +1,155 @@ +from django.db import migrations + +JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735" +NEW_JHOVE_CMD_ID = "2d2fd8b9-5215-41f8-bb33-58be6a2b4abb" + +JHOVE_VALIDATION_RULES_OF_PDF = ( + "085d5286-7616-4acd-88a4-ef65066362b9", + "95ef736c-e477-442e-86f4-4e9049be2b88", + "eebc3670-6692-4daf-92a2-c8b76606049a", + "dcc9bcd7-f085-4028-9599-bf4fd12816ee", + "535152f5-88a5-439b-8619-6f42fc2e4468", + "aa93748e-5899-4ecc-870e-3d47a38fda59", + "713bf728-e583-4cb5-a079-f36baf1a77e7", + "8e995eb3-4023-4168-b1e1-7b5f2b22237b", + "40966c69-42dd-49fb-8740-b22a85bc7e32", + "26f687fe-255a-469b-bca5-ac0992038789", + "9d3325a1-cc0a-4fa8-9f3b-ccd5b8c884d1", + "802e24ec-5e63-4e92-a0cf-33f11b4edf06", + "ab728cab-3072-4e20-a64b-ba2560467d93", + "b1a60f26-8927-46c5-843b-7eddeef6213e", + "c799f39a-10fd-4125-b11c-1011ef1ca15c", + "d4a1faba-a5a3-4955-a20a-6f71da1d35bc", + "f712b5a9-7dd5-4e39-b818-c7cda54b9366", + "1386de15-3152-4a24-afa6-eab7a224da65", + "fdd758b0-99a6-4447-b082-3a1098f13bf6", + "f51ed8e0-edb3-4ebc-84d5-11135cc1fe62", + "80ecc092-8f29-4810-8918-e81133092290", + "8835348d-60f2-4dba-a834-cf26c57f821c", + "57bbe864-2004-45a4-81be-d40aab02f170", + "87c23f92-ee9a-44b3-89b2-c024bbcc70a3", + "26573246-96de-4682-bd17-f0bccb50abfe", + "b7dd794b-7618-4d13-a2f9-e01dae884cf6", + "f3d2b70b-0b9d-43f6-80e0-9b987b77719d", + "6f4cbfc5-c560-4709-8d3e-aa5685bc4fd5", + "a0f916de-ed95-4f2a-9f6d-0cbfd8949cc2", + "f4074907-c111-4e6c-91ae-9c0526475a9a", +) +NEW_JHOVE_VALIDATION_CMD = r""" +import json +import subprocess +import sys + +from lxml import etree + +class JhoveException(Exception): + pass + +def parse_jhove_data(target): + args = ['jhove', '-h', 'xml', '-m', 'PDF-hul', target] + try: + output = subprocess.check_output(args).decode("utf8") + except subprocess.CalledProcessError: + raise JhoveException("Jhove failed when running: " + ' '.join(args)) + + return etree.fromstring(output.encode("utf8")) + +def get_status(doc): + status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status') + if status is None: + raise JhoveException("Unable to find status!") + + return status.text + +def get_outcome(status, format=None): + # JHOVE returns "bytestream" for unrecognized file formats. + # That can include unrecognized or malformed PDFs, JPEG2000s, etc. + # Since we're whitelisting the formats we're passing in, + # "bytestream" indicates that the format is not in fact well-formed + # regardless of what the status reads. + if format == "bytestream": + return "partial pass" + + if status == "Well-Formed and valid": + return "pass" + elif status == "Well-Formed, but not valid": + return "partial pass" + else: + return "fail" + +def get_format(doc): + format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format') + version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version') + + if format is None: + format = "Not detected" + else: + format = format.text + + if version is not None: + version = version.text + + return (format, version) + +def format_event_outcome_detail_note(format, version, result): + note = 'format="{}";'.format(format) + if version is not None: + note = note + ' version="{}";'.format(version) + note = note + ' result="{}"'.format(result) + + return note + +def main(target): + try: + doc = parse_jhove_data(target) + status = get_status(doc) + format, version = get_format(doc) + outcome = get_outcome(status, format) + note = format_event_outcome_detail_note(format, version, status) + + out = { + "eventOutcomeInformation": outcome, + "eventOutcomeDetailNote": note + } + print(json.dumps(out)) + + return 0 + except JhoveException as e: + return e + +if __name__ == '__main__': + target = sys.argv[1] + sys.exit(main(target)) +""" + + +def data_migration_up(apps, schema_editor): + """Update commands and rules.""" + _add_fp_command_for_pdf_validation( + apps, NEW_JHOVE_CMD_ID, NEW_JHOVE_VALIDATION_CMD, JHOVE_VALIDATION_RULES_OF_PDF + ) + + +def _add_fp_command_for_pdf_validation(apps, new_cmd_uuid, new_cmd, rule_uuids): + FPCommand = apps.get_model("fpr", "FPCommand") + FPRule = apps.get_model("fpr", "FPRule") + + # Add new command with the following + FPCommand.objects.create( + uuid=new_cmd_uuid, + tool_id=JHOVE_TOOL_ID, + command=new_cmd, + script_type="pythonScript", + command_usage="validation", + description="Validate PDF using JHOVE", + ) + + # Update existing rules + FPRule.objects.filter(uuid__in=rule_uuids).update( + command_id=new_cmd_uuid, + ) + + +class Migration(migrations.Migration): + dependencies = [("fpr", "0044_remove_fits")] + operations = [migrations.RunPython(data_migration_up)] diff --git a/tests/dashboard/fpr/test_views.py b/tests/dashboard/fpr/test_views.py index 6b246d21fd..31d2294598 100644 --- a/tests/dashboard/fpr/test_views.py +++ b/tests/dashboard/fpr/test_views.py @@ -102,8 +102,8 @@ def test_fpcommand_delete(dashboard_uuid: None, admin_client: Client) -> None: @pytest.mark.django_db -def test_fpcommand_revisions(dashboard_uuid: None, admin_client: Client) -> None: - fpcommand_id = "cb335c49-e6ce-445f-a774-494a6f2300c6" +def _test_fpcommand_revisions(dashboard_uuid: None, admin_client: Client) -> None: + fpcommand_id = "2d2fd8b9-5215-41f8-bb33-58be6a2b4abb" url = reverse("fpr:revision_list", args=["fpcommand", fpcommand_id]) fpcommand = models.FPCommand.active.get(uuid=fpcommand_id) From e8337cfbad513ce577a0a8354d2e3f1fd9a4360a Mon Sep 17 00:00:00 2001 From: Dhwani Patel Date: Thu, 9 Jan 2025 06:39:17 -0600 Subject: [PATCH 2/7] Add JPEG validation command of Jhove --- .../0046_add_jhove_validation_for_JPEG.py | 133 ++++++++++++++++++ .../0047_add_jhove_validation_for_JPEG2000.py | 128 +++++++++++++++++ 2 files changed, 261 insertions(+) create mode 100644 src/dashboard/src/fpr/migrations/0046_add_jhove_validation_for_JPEG.py create mode 100644 src/dashboard/src/fpr/migrations/0047_add_jhove_validation_for_JPEG2000.py diff --git a/src/dashboard/src/fpr/migrations/0046_add_jhove_validation_for_JPEG.py b/src/dashboard/src/fpr/migrations/0046_add_jhove_validation_for_JPEG.py new file mode 100644 index 0000000000..4d0528017b --- /dev/null +++ b/src/dashboard/src/fpr/migrations/0046_add_jhove_validation_for_JPEG.py @@ -0,0 +1,133 @@ +from django.db import migrations + +JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735" +NEW_JHOVE_CMD_ID = "1dd10753-12f4-4a71-bd69-532186b77d93" + +JHOVE_VALIDATION_RULES_OF_JPEG = ( + "005d14f1-5b67-43fc-b3a5-5048ec915b0b", + "aa4ad350-7e66-4637-a643-6e0bd037645d", + "5bc4c892-fe7b-4d22-8a9a-ea8c3dd0d171", + "f3f9652a-c903-491b-be89-5fc2469aaa1a", + "cddbffd4-4ada-4a6e-a713-82077a54e89e", + "913ff712-1856-48d7-85e9-415617fc9fdc", + "cc464095-02b3-471b-8f1d-221aecf37741", +) + +NEW_JHOVE_VALIDATION_CMD = r""" +import json +import subprocess +import sys + +from lxml import etree + +class JhoveException(Exception): + pass + +def parse_jhove_data(target): + args = ['jhove', '-h', 'xml', '-m', 'JPEG-hul', target] + try: + output = subprocess.check_output(args).decode("utf8") + except subprocess.CalledProcessError: + raise JhoveException("Jhove failed when running: " + ' '.join(args)) + + return etree.fromstring(output.encode("utf8")) + +def get_status(doc): + status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status') + if status is None: + raise JhoveException("Unable to find status!") + + return status.text + +def get_outcome(status, format=None): + # JHOVE returns "bytestream" for unrecognized file formats. + # That can include unrecognized or malformed PDFs, JPEG2000s, etc. + # Since we're whitelisting the formats we're passing in, + # "bytestream" indicates that the format is not in fact well-formed + # regardless of what the status reads. + if format == "bytestream": + return "partial pass" + + if status == "Well-Formed and valid": + return "pass" + elif status == "Well-Formed, but not valid": + return "partial pass" + else: + return "fail" + +def get_format(doc): + format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format') + version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version') + + if format is None: + format = "Not detected" + else: + format = format.text + + if version is not None: + version = version.text + + return (format, version) + +def format_event_outcome_detail_note(format, version, result): + note = 'format="{}";'.format(format) + if version is not None: + note = note + ' version="{}";'.format(version) + note = note + ' result="{}"'.format(result) + + return note + +def main(target): + try: + doc = parse_jhove_data(target) + status = get_status(doc) + format, version = get_format(doc) + outcome = get_outcome(status, format) + note = format_event_outcome_detail_note(format, version, status) + + out = { + "eventOutcomeInformation": outcome, + "eventOutcomeDetailNote": note + } + print(json.dumps(out)) + + return 0 + except JhoveException as e: + return e + +if __name__ == '__main__': + target = sys.argv[1] + sys.exit(main(target)) +""" + + +def data_migration_up(apps, schema_editor): + """Update commands and rules.""" + _add_fp_command_for_jpeg_validation( + apps, NEW_JHOVE_CMD_ID, NEW_JHOVE_VALIDATION_CMD, JHOVE_VALIDATION_RULES_OF_JPEG + ) + + +def _add_fp_command_for_jpeg_validation(apps, new_cmd_uuid, new_cmd, rule_uuids): + FPCommand = apps.get_model("fpr", "FPCommand") + FPRule = apps.get_model("fpr", "FPRule") + + # Add new command with the following + FPCommand.objects.create( + uuid=new_cmd_uuid, + tool_id=JHOVE_TOOL_ID, + command=new_cmd, + script_type="pythonScript", + command_usage="validation", + description="Validate JPEG using JHOVE", + ) + + # Update existing rules + FPRule.objects.filter(uuid__in=rule_uuids).update( + command_id=new_cmd_uuid, + ) + + +class Migration(migrations.Migration): + dependencies = [("fpr", "0045_add_jhove_validation_for_pdf")] + operations = [migrations.RunPython(data_migration_up)] diff --git a/src/dashboard/src/fpr/migrations/0047_add_jhove_validation_for_JPEG2000.py b/src/dashboard/src/fpr/migrations/0047_add_jhove_validation_for_JPEG2000.py new file mode 100644 index 0000000000..12241f6464 --- /dev/null +++ b/src/dashboard/src/fpr/migrations/0047_add_jhove_validation_for_JPEG2000.py @@ -0,0 +1,128 @@ +from django.db import migrations + +JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735" +NEW_JHOVE_CMD_ID = "34858217-507c-4e65-a7d9-ca62e1f72642" + +JHOVE_VALIDATION_RULES_OF_JPEG_2000 = ("e0cdb544-97d3-4915-9b08-fffad57bda10",) + +NEW_JHOVE_VALIDATION_CMD = r""" +import json +import subprocess +import sys + +from lxml import etree + +class JhoveException(Exception): + pass + +def parse_jhove_data(target): + args = ['jhove', '-h', 'xml', '-m', 'JPEG2000-hul', target] + try: + output = subprocess.check_output(args).decode("utf8") + except subprocess.CalledProcessError: + raise JhoveException("Jhove failed when running: " + ' '.join(args)) + + return etree.fromstring(output.encode("utf8")) + +def get_status(doc): + status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status') + if status is None: + raise JhoveException("Unable to find status!") + + return status.text + +def get_outcome(status, format=None): + # JHOVE returns "bytestream" for unrecognized file formats. + # That can include unrecognized or malformed PDFs, JPEG2000s, etc. + # Since we're whitelisting the formats we're passing in, + # "bytestream" indicates that the format is not in fact well-formed + # regardless of what the status reads. + if format == "bytestream": + return "partial pass" + + if status == "Well-Formed and valid": + return "pass" + elif status == "Well-Formed, but not valid": + return "partial pass" + else: + return "fail" + +def get_format(doc): + format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format') + version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version') + + if format is None: + format = "Not detected" + else: + format = format.text + + if version is not None: + version = version.text + + return (format, version) + +def format_event_outcome_detail_note(format, version, result): + note = 'format="{}";'.format(format) + if version is not None: + note = note + ' version="{}";'.format(version) + note = note + ' result="{}"'.format(result) + + return note + +def main(target): + try: + doc = parse_jhove_data(target) + status = get_status(doc) + format, version = get_format(doc) + outcome = get_outcome(status, format) + note = format_event_outcome_detail_note(format, version, status) + + out = { + "eventOutcomeInformation": outcome, + "eventOutcomeDetailNote": note + } + print(json.dumps(out)) + + return 0 + except JhoveException as e: + return e + +if __name__ == '__main__': + target = sys.argv[1] + sys.exit(main(target)) +""" + + +def data_migration_up(apps, schema_editor): + """Update commands and rules.""" + _add_fp_command_for_jpeg_2000_validation( + apps, + NEW_JHOVE_CMD_ID, + NEW_JHOVE_VALIDATION_CMD, + JHOVE_VALIDATION_RULES_OF_JPEG_2000, + ) + + +def _add_fp_command_for_jpeg_2000_validation(apps, new_cmd_uuid, new_cmd, rule_uuids): + FPCommand = apps.get_model("fpr", "FPCommand") + FPRule = apps.get_model("fpr", "FPRule") + + # Add new command with the following + FPCommand.objects.create( + uuid=new_cmd_uuid, + tool_id=JHOVE_TOOL_ID, + command=new_cmd, + script_type="pythonScript", + command_usage="validation", + description="Validate JPEG2000 using JHOVE", + ) + + # Update existing rules + FPRule.objects.filter(uuid__in=rule_uuids).update( + command_id=new_cmd_uuid, + ) + + +class Migration(migrations.Migration): + dependencies = [("fpr", "0046_add_jhove_validation_for_JPEG")] + operations = [migrations.RunPython(data_migration_up)] From 4146725ee302bf3074e2f6cc77e50640f093671b Mon Sep 17 00:00:00 2001 From: Dhwani Patel Date: Thu, 9 Jan 2025 10:07:15 -0600 Subject: [PATCH 3/7] Add TIFF validation command of Jhove --- .../0048_add_jhove_validation_for_TIFF.py | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 src/dashboard/src/fpr/migrations/0048_add_jhove_validation_for_TIFF.py diff --git a/src/dashboard/src/fpr/migrations/0048_add_jhove_validation_for_TIFF.py b/src/dashboard/src/fpr/migrations/0048_add_jhove_validation_for_TIFF.py new file mode 100644 index 0000000000..3005b7b9c7 --- /dev/null +++ b/src/dashboard/src/fpr/migrations/0048_add_jhove_validation_for_TIFF.py @@ -0,0 +1,138 @@ +from django.db import migrations + +JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735" +NEW_JHOVE_CMD_ID = "09a14b6b-3f4c-49d8-9a62-0c7212aca83c" + +JHOVE_VALIDATION_RULES_OF_TIFF = ( + "6b3ba38b-e208-450d-9b48-07897b6b7c42", + "48086e84-a933-42e0-87fd-ce195137c48d", + "4ea200fa-182c-4b17-9493-e9d3f7e467ff", + "a01418ce-fcb9-4554-add5-72010c719865", + "a7a6cc14-4d61-4030-b8dc-a1ca8ed97402", + "c5a30e3c-2100-4b5b-a9b5-27a236a345dd", + "56c72d8a-139b-4cdf-8dd0-d65a373301d2", + "62f0e3bd-a5bb-4fa0-b78b-dab15253b429", + "5df96ec2-b5a3-48b5-8599-3f292ff525c1", +) + +NEW_JHOVE_VALIDATION_CMD = r""" +import json +import subprocess +import sys + +from lxml import etree + +class JhoveException(Exception): + pass + +def parse_jhove_data(target): + args = ['jhove', '-h', 'xml', '-m', 'TIFF-hul', target] + try: + output = subprocess.check_output(args).decode("utf8") + except subprocess.CalledProcessError: + raise JhoveException("Jhove failed when running: " + ' '.join(args)) + + return etree.fromstring(output.encode("utf8")) + +def get_status(doc): + status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status') + if status is None: + raise JhoveException("Unable to find status!") + + return status.text + +def get_outcome(status, format=None): + # JHOVE returns "bytestream" for unrecognized file formats. + # That can include unrecognized or malformed PDFs, JPEG2000s, etc. + # Since we're whitelisting the formats we're passing in, + # "bytestream" indicates that the format is not in fact well-formed + # regardless of what the status reads. + if format == "bytestream": + return "partial pass" + + if status == "Well-Formed and valid": + return "pass" + elif status == "Well-Formed, but not valid": + return "partial pass" + else: + return "fail" + +def get_format(doc): + format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format') + version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version') + + if format is None: + format = "Not detected" + else: + format = format.text + + if version is not None: + version = version.text + + return (format, version) + +def format_event_outcome_detail_note(format, version, result): + note = 'format="{}";'.format(format) + if version is not None: + note = note + ' version="{}";'.format(version) + note = note + ' result="{}"'.format(result) + + return note + +def main(target): + try: + doc = parse_jhove_data(target) + status = get_status(doc) + format, version = get_format(doc) + outcome = get_outcome(status, format) + note = format_event_outcome_detail_note(format, version, status) + + out = { + "eventOutcomeInformation": outcome, + "eventOutcomeDetailNote": note + } + print(json.dumps(out)) + + return 0 + except JhoveException as e: + return e + +if __name__ == '__main__': + target = sys.argv[1] + sys.exit(main(target)) +""" + + +def data_migration_up(apps, schema_editor): + """Update commands and rules.""" + _add_fp_command_for_tiff_validation( + apps, + NEW_JHOVE_CMD_ID, + NEW_JHOVE_VALIDATION_CMD, + JHOVE_VALIDATION_RULES_OF_TIFF, + ) + + +def _add_fp_command_for_tiff_validation(apps, new_cmd_uuid, new_cmd, rule_uuids): + FPCommand = apps.get_model("fpr", "FPCommand") + FPRule = apps.get_model("fpr", "FPRule") + + # Add new command with the following + FPCommand.objects.create( + uuid=new_cmd_uuid, + tool_id=JHOVE_TOOL_ID, + command=new_cmd, + script_type="pythonScript", + command_usage="validation", + description="Validate TIFF using JHOVE", + ) + + # Update existing rules + FPRule.objects.filter(uuid__in=rule_uuids).update( + command_id=new_cmd_uuid, + ) + + +class Migration(migrations.Migration): + dependencies = [("fpr", "0047_add_jhove_validation_for_JPEG2000")] + operations = [migrations.RunPython(data_migration_up)] From ed9ff90acd5062f4ebdbaaa47668a70856f9d90d Mon Sep 17 00:00:00 2001 From: Dhwani Patel Date: Thu, 9 Jan 2025 12:11:18 -0600 Subject: [PATCH 4/7] Add WAVE validation command of Jhove --- .../0046_add_jhove_validation_for_JPEG.py | 1 + .../0049_add_jhove_validation_for_WAVE.py | 146 ++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 src/dashboard/src/fpr/migrations/0049_add_jhove_validation_for_WAVE.py diff --git a/src/dashboard/src/fpr/migrations/0046_add_jhove_validation_for_JPEG.py b/src/dashboard/src/fpr/migrations/0046_add_jhove_validation_for_JPEG.py index 4d0528017b..b017f5abb4 100644 --- a/src/dashboard/src/fpr/migrations/0046_add_jhove_validation_for_JPEG.py +++ b/src/dashboard/src/fpr/migrations/0046_add_jhove_validation_for_JPEG.py @@ -11,6 +11,7 @@ "cddbffd4-4ada-4a6e-a713-82077a54e89e", "913ff712-1856-48d7-85e9-415617fc9fdc", "cc464095-02b3-471b-8f1d-221aecf37741", + "ab286afc-f429-4e50-8a40-452c6331d630", ) NEW_JHOVE_VALIDATION_CMD = r""" diff --git a/src/dashboard/src/fpr/migrations/0049_add_jhove_validation_for_WAVE.py b/src/dashboard/src/fpr/migrations/0049_add_jhove_validation_for_WAVE.py new file mode 100644 index 0000000000..b74777a908 --- /dev/null +++ b/src/dashboard/src/fpr/migrations/0049_add_jhove_validation_for_WAVE.py @@ -0,0 +1,146 @@ +from django.db import migrations + +JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735" +NEW_JHOVE_CMD_ID = "33510bbe-8115-48af-8e6c-7cc7e1773d35" + +JHOVE_VALIDATION_RULES_OF_WAVE = ( + "67c0b096-63f4-4e30-b26f-6ed9365ea67c", + "981eae6c-4d7b-40ce-9bfd-1193c600a143", + "662caf44-cd04-4990-8e28-9f8425dba782", + "40616003-8af5-48d8-94ca-1871ae2cfaf1", + "0cd29763-b64a-43cc-9a72-5f8e6317bbae", + "18c019e8-ea26-49eb-a900-ec8388f1483d", + "37a5d85d-58dc-4f4e-8be9-b9ecced85d0d", + "1a01813e-430f-4a91-bda2-182e4620d328", + "8a0a1d71-5e56-482e-81b4-b3d425106d49", + "471303d4-de26-435c-83b2-8e72beccc60d", + "e13d6459-a749-4d31-9dd0-e0a59aab36cd", + "ee56ca6d-f6d0-4948-9834-2c82f5d223d5", + "dc9dc6a9-82b6-44b7-866a-db3e6314922e", + "10e514c0-e72a-4f70-afd0-4aed3bfa0ab9", + "ff989185-1b11-4f96-8075-e605e4cf4be4", + "42f3756b-7966-4a47-b029-59688bfc6e43", + "c6d7590f-83c1-4612-a300-3bff3d358199", +) + +NEW_JHOVE_VALIDATION_CMD = r""" +import json +import subprocess +import sys + +from lxml import etree + +class JhoveException(Exception): + pass + +def parse_jhove_data(target): + args = ['jhove', '-h', 'xml', '-m', 'WAVE-hul', target] + try: + output = subprocess.check_output(args).decode("utf8") + except subprocess.CalledProcessError: + raise JhoveException("Jhove failed when running: " + ' '.join(args)) + + return etree.fromstring(output.encode("utf8")) + +def get_status(doc): + status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status') + if status is None: + raise JhoveException("Unable to find status!") + + return status.text + +def get_outcome(status, format=None): + # JHOVE returns "bytestream" for unrecognized file formats. + # That can include unrecognized or malformed PDFs, JPEG2000s, etc. + # Since we're whitelisting the formats we're passing in, + # "bytestream" indicates that the format is not in fact well-formed + # regardless of what the status reads. + if format == "bytestream": + return "partial pass" + + if status == "Well-Formed and valid": + return "pass" + elif status == "Well-Formed, but not valid": + return "partial pass" + else: + return "fail" + +def get_format(doc): + format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format') + version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version') + + if format is None: + format = "Not detected" + else: + format = format.text + + if version is not None: + version = version.text + + return (format, version) + +def format_event_outcome_detail_note(format, version, result): + note = 'format="{}";'.format(format) + if version is not None: + note = note + ' version="{}";'.format(version) + note = note + ' result="{}"'.format(result) + + return note + +def main(target): + try: + doc = parse_jhove_data(target) + status = get_status(doc) + format, version = get_format(doc) + outcome = get_outcome(status, format) + note = format_event_outcome_detail_note(format, version, status) + + out = { + "eventOutcomeInformation": outcome, + "eventOutcomeDetailNote": note + } + print(json.dumps(out)) + + return 0 + except JhoveException as e: + return e + +if __name__ == '__main__': + target = sys.argv[1] + sys.exit(main(target)) +""" + + +def data_migration_up(apps, schema_editor): + """Update commands and rules.""" + _add_fp_command_for_wave_validation( + apps, + NEW_JHOVE_CMD_ID, + NEW_JHOVE_VALIDATION_CMD, + JHOVE_VALIDATION_RULES_OF_WAVE, + ) + + +def _add_fp_command_for_wave_validation(apps, new_cmd_uuid, new_cmd, rule_uuids): + FPCommand = apps.get_model("fpr", "FPCommand") + FPRule = apps.get_model("fpr", "FPRule") + + # Add new command with the following + FPCommand.objects.create( + uuid=new_cmd_uuid, + tool_id=JHOVE_TOOL_ID, + command=new_cmd, + script_type="pythonScript", + command_usage="validation", + description="Validate WAVE using JHOVE", + ) + + # Update existing rules + FPRule.objects.filter(uuid__in=rule_uuids).update( + command_id=new_cmd_uuid, + ) + + +class Migration(migrations.Migration): + dependencies = [("fpr", "0048_add_jhove_validation_for_TIFF")] + operations = [migrations.RunPython(data_migration_up)] From 2e66e96b569ffd41d89f23c7db65aa6da077f45d Mon Sep 17 00:00:00 2001 From: Dhwani Patel Date: Fri, 10 Jan 2025 11:28:32 -0600 Subject: [PATCH 5/7] Extract validation command and update fpcommand description --- .../lib/archivematicaFunctions.py | 90 +++++++++++++++++ .../0045_add_jhove_validation_for_pdf.py | 94 ++---------------- .../0046_add_jhove_validation_for_JPEG.py | 99 +++---------------- .../0047_add_jhove_validation_for_JPEG2000.py | 92 +---------------- .../0048_add_jhove_validation_for_TIFF.py | 97 +----------------- .../0049_add_jhove_validation_for_WAVE.py | 95 ++---------------- .../0050_add_jhove_validation_for_AIFF.py | 49 +++++++++ .../0051_add_jhove_validation_for_GIF.py | 48 +++++++++ .../0052_add_jhove_validation_for_WARC.py | 48 +++++++++ 9 files changed, 268 insertions(+), 444 deletions(-) create mode 100644 src/dashboard/src/fpr/migrations/0050_add_jhove_validation_for_AIFF.py create mode 100644 src/dashboard/src/fpr/migrations/0051_add_jhove_validation_for_GIF.py create mode 100644 src/dashboard/src/fpr/migrations/0052_add_jhove_validation_for_WARC.py diff --git a/src/archivematicaCommon/lib/archivematicaFunctions.py b/src/archivematicaCommon/lib/archivematicaFunctions.py index 6a6ece804d..f0126c6536 100644 --- a/src/archivematicaCommon/lib/archivematicaFunctions.py +++ b/src/archivematicaCommon/lib/archivematicaFunctions.py @@ -562,3 +562,93 @@ def get_oidc_secondary_providers( } return providers + + +def jhove_validation_command(module): + NEW_JHOVE_VALIDATION_CMD = r""" +import json +import subprocess +import sys + +from lxml import etree + +class JhoveException(Exception): + pass + +def parse_jhove_data(target): + args = ['jhove', '-h', 'xml', '-m', '%s', target] + try: + output = subprocess.check_output(args).decode("utf8") + except subprocess.CalledProcessError: + raise JhoveException("Jhove failed when running: " + ' '.join(args)) + + return etree.fromstring(output.encode("utf8")) + +def get_status(doc): + status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status') + if status is None: + raise JhoveException("Unable to find status!") + + return status.text + +def get_outcome(status, format=None): + # JHOVE returns "bytestream" for unrecognized file formats. + # That can include unrecognized or malformed PDFs, JPEG2000s, etc. + # Since we're whitelisting the formats we're passing in, + # "bytestream" indicates that the format is not in fact well-formed + # regardless of what the status reads. + if format == "bytestream": + return "partial pass" + + if status == "Well-Formed and valid": + return "pass" + elif status == "Well-Formed, but not valid": + return "partial pass" + else: + return "fail" + +def get_format(doc): + format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format') + version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version') + + if format is None: + format = "Not detected" + else: + format = format.text + + if version is not None: + version = version.text + + return (format, version) + +def format_event_outcome_detail_note(format, version, result): + note = 'format="{}";'.format(format) + if version is not None: + note = note + ' version="{}";'.format(version) + note = note + ' result="{}"'.format(result) + + return note + +def main(target): + try: + doc = parse_jhove_data(target) + status = get_status(doc) + format, version = get_format(doc) + outcome = get_outcome(status, format) + note = format_event_outcome_detail_note(format, version, status) + + out = { + "eventOutcomeInformation": outcome, + "eventOutcomeDetailNote": note + } + print(json.dumps(out)) + + return 0 + except JhoveException as e: + return e + +if __name__ == '__main__': + target = sys.argv[1] + sys.exit(main(target)) +""" + return NEW_JHOVE_VALIDATION_CMD % module diff --git a/src/dashboard/src/fpr/migrations/0045_add_jhove_validation_for_pdf.py b/src/dashboard/src/fpr/migrations/0045_add_jhove_validation_for_pdf.py index 49dcac9082..6b4aa363b2 100644 --- a/src/dashboard/src/fpr/migrations/0045_add_jhove_validation_for_pdf.py +++ b/src/dashboard/src/fpr/migrations/0045_add_jhove_validation_for_pdf.py @@ -1,3 +1,4 @@ +from archivematicaFunctions import jhove_validation_command from django.db import migrations JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735" @@ -35,98 +36,17 @@ "a0f916de-ed95-4f2a-9f6d-0cbfd8949cc2", "f4074907-c111-4e6c-91ae-9c0526475a9a", ) -NEW_JHOVE_VALIDATION_CMD = r""" -import json -import subprocess -import sys -from lxml import etree - -class JhoveException(Exception): - pass - -def parse_jhove_data(target): - args = ['jhove', '-h', 'xml', '-m', 'PDF-hul', target] - try: - output = subprocess.check_output(args).decode("utf8") - except subprocess.CalledProcessError: - raise JhoveException("Jhove failed when running: " + ' '.join(args)) - - return etree.fromstring(output.encode("utf8")) - -def get_status(doc): - status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status') - if status is None: - raise JhoveException("Unable to find status!") - - return status.text - -def get_outcome(status, format=None): - # JHOVE returns "bytestream" for unrecognized file formats. - # That can include unrecognized or malformed PDFs, JPEG2000s, etc. - # Since we're whitelisting the formats we're passing in, - # "bytestream" indicates that the format is not in fact well-formed - # regardless of what the status reads. - if format == "bytestream": - return "partial pass" - - if status == "Well-Formed and valid": - return "pass" - elif status == "Well-Formed, but not valid": - return "partial pass" - else: - return "fail" - -def get_format(doc): - format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format') - version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version') - - if format is None: - format = "Not detected" - else: - format = format.text - - if version is not None: - version = version.text - - return (format, version) - -def format_event_outcome_detail_note(format, version, result): - note = 'format="{}";'.format(format) - if version is not None: - note = note + ' version="{}";'.format(version) - note = note + ' result="{}"'.format(result) - - return note - -def main(target): - try: - doc = parse_jhove_data(target) - status = get_status(doc) - format, version = get_format(doc) - outcome = get_outcome(status, format) - note = format_event_outcome_detail_note(format, version, status) - - out = { - "eventOutcomeInformation": outcome, - "eventOutcomeDetailNote": note - } - print(json.dumps(out)) - - return 0 - except JhoveException as e: - return e - -if __name__ == '__main__': - target = sys.argv[1] - sys.exit(main(target)) -""" +module = "PDF-hul" def data_migration_up(apps, schema_editor): """Update commands and rules.""" _add_fp_command_for_pdf_validation( - apps, NEW_JHOVE_CMD_ID, NEW_JHOVE_VALIDATION_CMD, JHOVE_VALIDATION_RULES_OF_PDF + apps, + NEW_JHOVE_CMD_ID, + jhove_validation_command(module), + JHOVE_VALIDATION_RULES_OF_PDF, ) @@ -141,7 +61,7 @@ def _add_fp_command_for_pdf_validation(apps, new_cmd_uuid, new_cmd, rule_uuids): command=new_cmd, script_type="pythonScript", command_usage="validation", - description="Validate PDF using JHOVE", + description="Validate using JHOVE PDF-hul", ) # Update existing rules diff --git a/src/dashboard/src/fpr/migrations/0046_add_jhove_validation_for_JPEG.py b/src/dashboard/src/fpr/migrations/0046_add_jhove_validation_for_JPEG.py index b017f5abb4..3083e80dda 100644 --- a/src/dashboard/src/fpr/migrations/0046_add_jhove_validation_for_JPEG.py +++ b/src/dashboard/src/fpr/migrations/0046_add_jhove_validation_for_JPEG.py @@ -1,3 +1,4 @@ +from archivematicaFunctions import jhove_validation_command from django.db import migrations JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735" @@ -12,100 +13,22 @@ "913ff712-1856-48d7-85e9-415617fc9fdc", "cc464095-02b3-471b-8f1d-221aecf37741", "ab286afc-f429-4e50-8a40-452c6331d630", + "c5a30e3c-2100-4b5b-a9b5-27a236a345dd", + "56c72d8a-139b-4cdf-8dd0-d65a373301d2", + "62f0e3bd-a5bb-4fa0-b78b-dab15253b429", + "5df96ec2-b5a3-48b5-8599-3f292ff525c1", ) -NEW_JHOVE_VALIDATION_CMD = r""" -import json -import subprocess -import sys - -from lxml import etree - -class JhoveException(Exception): - pass - -def parse_jhove_data(target): - args = ['jhove', '-h', 'xml', '-m', 'JPEG-hul', target] - try: - output = subprocess.check_output(args).decode("utf8") - except subprocess.CalledProcessError: - raise JhoveException("Jhove failed when running: " + ' '.join(args)) - - return etree.fromstring(output.encode("utf8")) - -def get_status(doc): - status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status') - if status is None: - raise JhoveException("Unable to find status!") - - return status.text - -def get_outcome(status, format=None): - # JHOVE returns "bytestream" for unrecognized file formats. - # That can include unrecognized or malformed PDFs, JPEG2000s, etc. - # Since we're whitelisting the formats we're passing in, - # "bytestream" indicates that the format is not in fact well-formed - # regardless of what the status reads. - if format == "bytestream": - return "partial pass" - - if status == "Well-Formed and valid": - return "pass" - elif status == "Well-Formed, but not valid": - return "partial pass" - else: - return "fail" - -def get_format(doc): - format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format') - version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version') - - if format is None: - format = "Not detected" - else: - format = format.text - - if version is not None: - version = version.text - - return (format, version) - -def format_event_outcome_detail_note(format, version, result): - note = 'format="{}";'.format(format) - if version is not None: - note = note + ' version="{}";'.format(version) - note = note + ' result="{}"'.format(result) - - return note - -def main(target): - try: - doc = parse_jhove_data(target) - status = get_status(doc) - format, version = get_format(doc) - outcome = get_outcome(status, format) - note = format_event_outcome_detail_note(format, version, status) - - out = { - "eventOutcomeInformation": outcome, - "eventOutcomeDetailNote": note - } - print(json.dumps(out)) - - return 0 - except JhoveException as e: - return e - -if __name__ == '__main__': - target = sys.argv[1] - sys.exit(main(target)) -""" +module = "JPEG-hul" def data_migration_up(apps, schema_editor): """Update commands and rules.""" _add_fp_command_for_jpeg_validation( - apps, NEW_JHOVE_CMD_ID, NEW_JHOVE_VALIDATION_CMD, JHOVE_VALIDATION_RULES_OF_JPEG + apps, + NEW_JHOVE_CMD_ID, + jhove_validation_command(module), + JHOVE_VALIDATION_RULES_OF_JPEG, ) @@ -120,7 +43,7 @@ def _add_fp_command_for_jpeg_validation(apps, new_cmd_uuid, new_cmd, rule_uuids) command=new_cmd, script_type="pythonScript", command_usage="validation", - description="Validate JPEG using JHOVE", + description="Validate using JHOVE JPEG-hul", ) # Update existing rules diff --git a/src/dashboard/src/fpr/migrations/0047_add_jhove_validation_for_JPEG2000.py b/src/dashboard/src/fpr/migrations/0047_add_jhove_validation_for_JPEG2000.py index 12241f6464..48aa063a5f 100644 --- a/src/dashboard/src/fpr/migrations/0047_add_jhove_validation_for_JPEG2000.py +++ b/src/dashboard/src/fpr/migrations/0047_add_jhove_validation_for_JPEG2000.py @@ -1,3 +1,4 @@ +from archivematicaFunctions import jhove_validation_command from django.db import migrations JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735" @@ -5,92 +6,7 @@ JHOVE_VALIDATION_RULES_OF_JPEG_2000 = ("e0cdb544-97d3-4915-9b08-fffad57bda10",) -NEW_JHOVE_VALIDATION_CMD = r""" -import json -import subprocess -import sys - -from lxml import etree - -class JhoveException(Exception): - pass - -def parse_jhove_data(target): - args = ['jhove', '-h', 'xml', '-m', 'JPEG2000-hul', target] - try: - output = subprocess.check_output(args).decode("utf8") - except subprocess.CalledProcessError: - raise JhoveException("Jhove failed when running: " + ' '.join(args)) - - return etree.fromstring(output.encode("utf8")) - -def get_status(doc): - status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status') - if status is None: - raise JhoveException("Unable to find status!") - - return status.text - -def get_outcome(status, format=None): - # JHOVE returns "bytestream" for unrecognized file formats. - # That can include unrecognized or malformed PDFs, JPEG2000s, etc. - # Since we're whitelisting the formats we're passing in, - # "bytestream" indicates that the format is not in fact well-formed - # regardless of what the status reads. - if format == "bytestream": - return "partial pass" - - if status == "Well-Formed and valid": - return "pass" - elif status == "Well-Formed, but not valid": - return "partial pass" - else: - return "fail" - -def get_format(doc): - format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format') - version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version') - - if format is None: - format = "Not detected" - else: - format = format.text - - if version is not None: - version = version.text - - return (format, version) - -def format_event_outcome_detail_note(format, version, result): - note = 'format="{}";'.format(format) - if version is not None: - note = note + ' version="{}";'.format(version) - note = note + ' result="{}"'.format(result) - - return note - -def main(target): - try: - doc = parse_jhove_data(target) - status = get_status(doc) - format, version = get_format(doc) - outcome = get_outcome(status, format) - note = format_event_outcome_detail_note(format, version, status) - - out = { - "eventOutcomeInformation": outcome, - "eventOutcomeDetailNote": note - } - print(json.dumps(out)) - - return 0 - except JhoveException as e: - return e - -if __name__ == '__main__': - target = sys.argv[1] - sys.exit(main(target)) -""" +module = "JPEG2000-hul" def data_migration_up(apps, schema_editor): @@ -98,7 +14,7 @@ def data_migration_up(apps, schema_editor): _add_fp_command_for_jpeg_2000_validation( apps, NEW_JHOVE_CMD_ID, - NEW_JHOVE_VALIDATION_CMD, + jhove_validation_command(module), JHOVE_VALIDATION_RULES_OF_JPEG_2000, ) @@ -114,7 +30,7 @@ def _add_fp_command_for_jpeg_2000_validation(apps, new_cmd_uuid, new_cmd, rule_u command=new_cmd, script_type="pythonScript", command_usage="validation", - description="Validate JPEG2000 using JHOVE", + description="Validate using JHOVE JPEG2000-hul", ) # Update existing rules diff --git a/src/dashboard/src/fpr/migrations/0048_add_jhove_validation_for_TIFF.py b/src/dashboard/src/fpr/migrations/0048_add_jhove_validation_for_TIFF.py index 3005b7b9c7..a57c8d1baa 100644 --- a/src/dashboard/src/fpr/migrations/0048_add_jhove_validation_for_TIFF.py +++ b/src/dashboard/src/fpr/migrations/0048_add_jhove_validation_for_TIFF.py @@ -1,3 +1,4 @@ +from archivematicaFunctions import jhove_validation_command from django.db import migrations JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735" @@ -9,98 +10,8 @@ "4ea200fa-182c-4b17-9493-e9d3f7e467ff", "a01418ce-fcb9-4554-add5-72010c719865", "a7a6cc14-4d61-4030-b8dc-a1ca8ed97402", - "c5a30e3c-2100-4b5b-a9b5-27a236a345dd", - "56c72d8a-139b-4cdf-8dd0-d65a373301d2", - "62f0e3bd-a5bb-4fa0-b78b-dab15253b429", - "5df96ec2-b5a3-48b5-8599-3f292ff525c1", ) - -NEW_JHOVE_VALIDATION_CMD = r""" -import json -import subprocess -import sys - -from lxml import etree - -class JhoveException(Exception): - pass - -def parse_jhove_data(target): - args = ['jhove', '-h', 'xml', '-m', 'TIFF-hul', target] - try: - output = subprocess.check_output(args).decode("utf8") - except subprocess.CalledProcessError: - raise JhoveException("Jhove failed when running: " + ' '.join(args)) - - return etree.fromstring(output.encode("utf8")) - -def get_status(doc): - status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status') - if status is None: - raise JhoveException("Unable to find status!") - - return status.text - -def get_outcome(status, format=None): - # JHOVE returns "bytestream" for unrecognized file formats. - # That can include unrecognized or malformed PDFs, JPEG2000s, etc. - # Since we're whitelisting the formats we're passing in, - # "bytestream" indicates that the format is not in fact well-formed - # regardless of what the status reads. - if format == "bytestream": - return "partial pass" - - if status == "Well-Formed and valid": - return "pass" - elif status == "Well-Formed, but not valid": - return "partial pass" - else: - return "fail" - -def get_format(doc): - format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format') - version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version') - - if format is None: - format = "Not detected" - else: - format = format.text - - if version is not None: - version = version.text - - return (format, version) - -def format_event_outcome_detail_note(format, version, result): - note = 'format="{}";'.format(format) - if version is not None: - note = note + ' version="{}";'.format(version) - note = note + ' result="{}"'.format(result) - - return note - -def main(target): - try: - doc = parse_jhove_data(target) - status = get_status(doc) - format, version = get_format(doc) - outcome = get_outcome(status, format) - note = format_event_outcome_detail_note(format, version, status) - - out = { - "eventOutcomeInformation": outcome, - "eventOutcomeDetailNote": note - } - print(json.dumps(out)) - - return 0 - except JhoveException as e: - return e - -if __name__ == '__main__': - target = sys.argv[1] - sys.exit(main(target)) -""" +module = "TIFF-hul" def data_migration_up(apps, schema_editor): @@ -108,7 +19,7 @@ def data_migration_up(apps, schema_editor): _add_fp_command_for_tiff_validation( apps, NEW_JHOVE_CMD_ID, - NEW_JHOVE_VALIDATION_CMD, + jhove_validation_command(module), JHOVE_VALIDATION_RULES_OF_TIFF, ) @@ -124,7 +35,7 @@ def _add_fp_command_for_tiff_validation(apps, new_cmd_uuid, new_cmd, rule_uuids) command=new_cmd, script_type="pythonScript", command_usage="validation", - description="Validate TIFF using JHOVE", + description="Validate using JHOVE TIFF-hul", ) # Update existing rules diff --git a/src/dashboard/src/fpr/migrations/0049_add_jhove_validation_for_WAVE.py b/src/dashboard/src/fpr/migrations/0049_add_jhove_validation_for_WAVE.py index b74777a908..4b1a7d44df 100644 --- a/src/dashboard/src/fpr/migrations/0049_add_jhove_validation_for_WAVE.py +++ b/src/dashboard/src/fpr/migrations/0049_add_jhove_validation_for_WAVE.py @@ -1,3 +1,4 @@ +from archivematicaFunctions import jhove_validation_command from django.db import migrations JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735" @@ -21,94 +22,12 @@ "ff989185-1b11-4f96-8075-e605e4cf4be4", "42f3756b-7966-4a47-b029-59688bfc6e43", "c6d7590f-83c1-4612-a300-3bff3d358199", + "ecd66812-c89a-4231-802e-2e69b47bae2a", + "88cb0134-7808-450f-a0f4-365a818d583a", + "ffa25cf6-c1a5-45f2-9bee-798aa04df172", ) -NEW_JHOVE_VALIDATION_CMD = r""" -import json -import subprocess -import sys - -from lxml import etree - -class JhoveException(Exception): - pass - -def parse_jhove_data(target): - args = ['jhove', '-h', 'xml', '-m', 'WAVE-hul', target] - try: - output = subprocess.check_output(args).decode("utf8") - except subprocess.CalledProcessError: - raise JhoveException("Jhove failed when running: " + ' '.join(args)) - - return etree.fromstring(output.encode("utf8")) - -def get_status(doc): - status = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}status') - if status is None: - raise JhoveException("Unable to find status!") - - return status.text - -def get_outcome(status, format=None): - # JHOVE returns "bytestream" for unrecognized file formats. - # That can include unrecognized or malformed PDFs, JPEG2000s, etc. - # Since we're whitelisting the formats we're passing in, - # "bytestream" indicates that the format is not in fact well-formed - # regardless of what the status reads. - if format == "bytestream": - return "partial pass" - - if status == "Well-Formed and valid": - return "pass" - elif status == "Well-Formed, but not valid": - return "partial pass" - else: - return "fail" - -def get_format(doc): - format = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}format') - version = doc.find('.{http://schema.openpreservation.org/ois/xml/ns/jhove}repInfo/{http://schema.openpreservation.org/ois/xml/ns/jhove}version') - - if format is None: - format = "Not detected" - else: - format = format.text - - if version is not None: - version = version.text - - return (format, version) - -def format_event_outcome_detail_note(format, version, result): - note = 'format="{}";'.format(format) - if version is not None: - note = note + ' version="{}";'.format(version) - note = note + ' result="{}"'.format(result) - - return note - -def main(target): - try: - doc = parse_jhove_data(target) - status = get_status(doc) - format, version = get_format(doc) - outcome = get_outcome(status, format) - note = format_event_outcome_detail_note(format, version, status) - - out = { - "eventOutcomeInformation": outcome, - "eventOutcomeDetailNote": note - } - print(json.dumps(out)) - - return 0 - except JhoveException as e: - return e - -if __name__ == '__main__': - target = sys.argv[1] - sys.exit(main(target)) -""" +module = "WAVE-hul" def data_migration_up(apps, schema_editor): @@ -116,7 +35,7 @@ def data_migration_up(apps, schema_editor): _add_fp_command_for_wave_validation( apps, NEW_JHOVE_CMD_ID, - NEW_JHOVE_VALIDATION_CMD, + jhove_validation_command(module), JHOVE_VALIDATION_RULES_OF_WAVE, ) @@ -132,7 +51,7 @@ def _add_fp_command_for_wave_validation(apps, new_cmd_uuid, new_cmd, rule_uuids) command=new_cmd, script_type="pythonScript", command_usage="validation", - description="Validate WAVE using JHOVE", + description="Validate using JHOVE WAVE-hul", ) # Update existing rules diff --git a/src/dashboard/src/fpr/migrations/0050_add_jhove_validation_for_AIFF.py b/src/dashboard/src/fpr/migrations/0050_add_jhove_validation_for_AIFF.py new file mode 100644 index 0000000000..d0bed0b645 --- /dev/null +++ b/src/dashboard/src/fpr/migrations/0050_add_jhove_validation_for_AIFF.py @@ -0,0 +1,49 @@ +from archivematicaFunctions import jhove_validation_command +from django.db import migrations + +JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735" +NEW_JHOVE_CMD_ID = "d610e7d6-f78c-4fb8-bd19-09a10fa48672" + +JHOVE_VALIDATION_RULES_OF_AIFF = ( + "76bfa370-ac87-41e2-995b-e01bb8c977d0", + "7af37625-f547-4d13-ab52-e5bddf249027", + "fcefa9af-322c-4c9b-afd2-82231dd953fc", + "407dcd55-71f5-4d83-8e21-6e3809a3fba8", +) + +module = "AIFF-hul" + + +def data_migration_up(apps, schema_editor): + """Update commands and rules.""" + _add_fp_command_for_aiff_validation( + apps, + NEW_JHOVE_CMD_ID, + jhove_validation_command(module), + JHOVE_VALIDATION_RULES_OF_AIFF, + ) + + +def _add_fp_command_for_aiff_validation(apps, new_cmd_uuid, new_cmd, rule_uuids): + FPCommand = apps.get_model("fpr", "FPCommand") + FPRule = apps.get_model("fpr", "FPRule") + + # Add new command with the following + FPCommand.objects.create( + uuid=new_cmd_uuid, + tool_id=JHOVE_TOOL_ID, + command=new_cmd, + script_type="pythonScript", + command_usage="validation", + description="Validate using JHOVE AIFF-hul", + ) + + # Update existing rules/ + FPRule.objects.filter(uuid__in=rule_uuids).update( + command_id=new_cmd_uuid, + ) + + +class Migration(migrations.Migration): + dependencies = [("fpr", "0049_add_jhove_validation_for_WAVE")] + operations = [migrations.RunPython(data_migration_up)] diff --git a/src/dashboard/src/fpr/migrations/0051_add_jhove_validation_for_GIF.py b/src/dashboard/src/fpr/migrations/0051_add_jhove_validation_for_GIF.py new file mode 100644 index 0000000000..1d7ead494e --- /dev/null +++ b/src/dashboard/src/fpr/migrations/0051_add_jhove_validation_for_GIF.py @@ -0,0 +1,48 @@ +from archivematicaFunctions import jhove_validation_command +from django.db import migrations + +JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735" +NEW_JHOVE_CMD_ID = "8816399f-ca5d-4861-9ae9-a4d22cc89e63" + +JHOVE_VALIDATION_RULES_OF_GIF = ( + "4324a41f-6016-4b28-a0b0-c343dbaca42e", + "6217dbf1-2b4f-49ce-ab87-d0ed1e1ef890", + "986b53a8-3407-4d87-89ec-20575e61292a", +) + +module = "gif-hul" + + +def data_migration_up(apps, schema_editor): + """Update commands and rules.""" + _add_fp_command_for_gif_validation( + apps, + NEW_JHOVE_CMD_ID, + jhove_validation_command(module), + JHOVE_VALIDATION_RULES_OF_GIF, + ) + + +def _add_fp_command_for_gif_validation(apps, new_cmd_uuid, new_cmd, rule_uuids): + FPCommand = apps.get_model("fpr", "FPCommand") + FPRule = apps.get_model("fpr", "FPRule") + + # Add new command with the following + FPCommand.objects.create( + uuid=new_cmd_uuid, + tool_id=JHOVE_TOOL_ID, + command=new_cmd, + script_type="pythonScript", + command_usage="validation", + description="Validate using JHOVE GIF-hul", + ) + + # Update existing rules + FPRule.objects.filter(uuid__in=rule_uuids).update( + command_id=new_cmd_uuid, + ) + + +class Migration(migrations.Migration): + dependencies = [("fpr", "0050_add_jhove_validation_for_AIFF")] + operations = [migrations.RunPython(data_migration_up)] diff --git a/src/dashboard/src/fpr/migrations/0052_add_jhove_validation_for_WARC.py b/src/dashboard/src/fpr/migrations/0052_add_jhove_validation_for_WARC.py new file mode 100644 index 0000000000..03274c736f --- /dev/null +++ b/src/dashboard/src/fpr/migrations/0052_add_jhove_validation_for_WARC.py @@ -0,0 +1,48 @@ +from archivematicaFunctions import jhove_validation_command +from django.db import migrations + +JHOVE_TOOL_ID = "085d8690-93b7-4d31-84f7-2c5f4cbf6735" +NEW_JHOVE_CMD_ID = "228f6676-97d7-44ff-bdb8-bfe181dee1a1" + +JHOVE_VALIDATION_RULES_OF_WARC = ( + "886eeaba-55b3-4ed9-9441-59aa1454ecdc", + "7497c57b-ee7a-420f-b197-e1752a0f071f", + "27c0dabc-fda4-4060-ab77-ce6e86f424c8", +) + +module = "WARC-kb" + + +def data_migration_up(apps, schema_editor): + """Update commands and rules.""" + _add_fp_command_for_warc_validation( + apps, + NEW_JHOVE_CMD_ID, + jhove_validation_command(module), + JHOVE_VALIDATION_RULES_OF_WARC, + ) + + +def _add_fp_command_for_warc_validation(apps, new_cmd_uuid, new_cmd, rule_uuids): + FPCommand = apps.get_model("fpr", "FPCommand") + FPRule = apps.get_model("fpr", "FPRule") + + # Add new command with the following + FPCommand.objects.create( + uuid=new_cmd_uuid, + tool_id=JHOVE_TOOL_ID, + command=new_cmd, + script_type="pythonScript", + command_usage="validation", + description="Validate using JHOVE WARC-kb", + ) + + # Update existing rules + FPRule.objects.filter(uuid__in=rule_uuids).update( + command_id=new_cmd_uuid, + ) + + +class Migration(migrations.Migration): + dependencies = [("fpr", "0051_add_jhove_validation_for_GIF")] + operations = [migrations.RunPython(data_migration_up)] From f929f7e6070ba4222b3b8ca3105876068f528e16 Mon Sep 17 00:00:00 2001 From: Dhwani Patel Date: Tue, 28 Jan 2025 12:03:36 -0600 Subject: [PATCH 6/7] Update submodules --- hack/submodules/archivematica-sampledata | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/submodules/archivematica-sampledata b/hack/submodules/archivematica-sampledata index 419a8bf3ad..5b4ee5a32f 160000 --- a/hack/submodules/archivematica-sampledata +++ b/hack/submodules/archivematica-sampledata @@ -1 +1 @@ -Subproject commit 419a8bf3adfa8611d4d2a8f554da633aa3ca5d08 +Subproject commit 5b4ee5a32f75199d844d815f79449b2201f08c8b From de4a37c3b3ca54e073c473ef0bdc89b55ef5620c Mon Sep 17 00:00:00 2001 From: Dhwani Patel Date: Tue, 4 Feb 2025 11:23:31 -0600 Subject: [PATCH 7/7] Update AMAUAT submodule --- hack/submodules/archivematica-acceptance-tests | 2 +- hack/submodules/archivematica-sampledata | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hack/submodules/archivematica-acceptance-tests b/hack/submodules/archivematica-acceptance-tests index 4734e1a1b5..827fba4dff 160000 --- a/hack/submodules/archivematica-acceptance-tests +++ b/hack/submodules/archivematica-acceptance-tests @@ -1 +1 @@ -Subproject commit 4734e1a1b5d893fadffc04d0f6687586ef5c6d1e +Subproject commit 827fba4dffb1bb7820ac64a5fa645f0fa6e6291f diff --git a/hack/submodules/archivematica-sampledata b/hack/submodules/archivematica-sampledata index 5b4ee5a32f..aaeda2d55a 160000 --- a/hack/submodules/archivematica-sampledata +++ b/hack/submodules/archivematica-sampledata @@ -1 +1 @@ -Subproject commit 5b4ee5a32f75199d844d815f79449b2201f08c8b +Subproject commit aaeda2d55ac737a9be9b74a8f7000f664ee64d2d