From 981716ccff69dbb0d07b3cc3c5847ee19f5be277 Mon Sep 17 00:00:00 2001 From: deeplow Date: Tue, 20 Sep 2022 15:20:47 +0100 Subject: [PATCH 01/10] Sequential bulk document support in cli Basic implementation of bulk document support in dangerzone-cli. Usage: dangerzone-cli [OPTIONS] doc1.pdf doc2.pdf --- dangerzone/args.py | 20 +++++++++++++- dangerzone/cli.py | 51 ++++++++++++++++++++--------------- dangerzone/document.py | 40 ++++++++++++++++++++++----- dangerzone/gui/main_window.py | 3 --- dangerzone/logic.py | 34 ++++++++++++++++++++++- tests/test_document.py | 21 +++++++++++++++ 6 files changed, 137 insertions(+), 32 deletions(-) diff --git a/dangerzone/args.py b/dangerzone/args.py index c8070780c..120da65bf 100644 --- a/dangerzone/args.py +++ b/dangerzone/args.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import List, Optional, Tuple import click @@ -17,6 +17,18 @@ def _validate_input_filename( return filename +@errors.handle_document_errors +def _validate_input_filenames( + ctx: click.Context, param: List[str], value: Tuple[str] +) -> List[str]: + normalized_filenames = [] + for filename in value: + filename = Document.normalize_filename(filename) + Document.validate_input_filename(filename) + normalized_filenames.append(filename) + return normalized_filenames + + @errors.handle_document_errors def _validate_output_filename( ctx: click.Context, param: str, value: Optional[str] @@ -42,6 +54,12 @@ def validate_input_filename( return _validate_input_filename(ctx, param, value) +def validate_input_filenames( + ctx: click.Context, param: List[str], value: Tuple[str] +) -> List[str]: + return _validate_input_filenames(ctx, param, value) + + def validate_output_filename( ctx: click.Context, param: str, value: Optional[str] ) -> Optional[str]: diff --git a/dangerzone/cli.py b/dangerzone/cli.py index 3270b853c..a06283aca 100644 --- a/dangerzone/cli.py +++ b/dangerzone/cli.py @@ -2,7 +2,7 @@ import logging import os import sys -from typing import Optional +from typing import List, Optional import click from colorama import Back, Fore, Style @@ -26,23 +26,29 @@ def print_header(s: str) -> None: help=f"Default is filename ending with {SAFE_EXTENSION}", ) @click.option("--ocr-lang", help="Language to OCR, defaults to none") -@click.argument("filename", required=True, callback=args.validate_input_filename) +@click.argument( + "filenames", + required=True, + nargs=-1, + type=click.UNPROCESSED, + callback=args.validate_input_filenames, +) @errors.handle_document_errors def cli_main( - output_filename: Optional[str], ocr_lang: Optional[str], filename: str + output_filename: Optional[str], ocr_lang: Optional[str], filenames: List[str] ) -> None: setup_logging() dangerzone = DangerzoneCore() display_banner() - - document = Document(filename) - - # Set PDF output filename - if output_filename: - document.output_filename = output_filename + if len(filenames) == 1 and output_filename: + dangerzone.add_document(filenames[0], output_filename) + elif len(filenames) > 1 and output_filename: + click.echo("--output-filename can only be used with one input file.") + exit(1) else: - document.set_default_output_filename() + for filename in filenames: + dangerzone.add_document(filename) # Validate OCR language if ocr_lang: @@ -75,18 +81,21 @@ def stdout_callback(line: str) -> None: except: click.echo(f"Invalid JSON returned from container: {line}") - if convert( - document.input_filename, - document.output_filename, - ocr_lang, - stdout_callback, - ): - print_header("Safe PDF created successfully") - click.echo(document.output_filename) - exit(0) - else: - print_header("Failed to convert document") + dangerzone.convert_documents(ocr_lang, stdout_callback) + documents_safe = dangerzone.get_safe_documents() + documents_failed = dangerzone.get_failed_documents() + + if documents_safe != []: + print_header("Safe PDF(s) created successfully") + for document in documents_safe: + click.echo(document.output_filename) + if documents_failed != []: + print_header("Failed to convert document(s)") + for document in documents_failed: + click.echo(document.input_filename) exit(1) + else: + exit(0) def setup_logging() -> None: diff --git a/dangerzone/document.py b/dangerzone/document.py index 6cfc28ff2..9fb0b12df 100644 --- a/dangerzone/document.py +++ b/dangerzone/document.py @@ -1,3 +1,4 @@ +import enum import os import platform import stat @@ -18,13 +19,23 @@ class Document: document, and validating its info. """ - def __init__(self, input_filename: str = None) -> None: + # document conversion state + STATE_UNCONVERTED = enum.auto() + STATE_SAFE = enum.auto() + STATE_FAILED = enum.auto() + + def __init__(self, input_filename: str = None, output_filename: str = None) -> None: self._input_filename: Optional[str] = None self._output_filename: Optional[str] = None if input_filename: self.input_filename = input_filename + if output_filename: + self.output_filename = output_filename + + self.state = Document.STATE_UNCONVERTED + @staticmethod def normalize_filename(filename: str) -> str: return os.path.abspath(filename) @@ -68,7 +79,10 @@ def input_filename(self, filename: str) -> None: @property def output_filename(self) -> str: if self._output_filename is None: - raise DocumentFilenameException("Output filename has not been set yet.") + if self._input_filename is not None: + return self.default_output_filename + else: + raise DocumentFilenameException("Output filename has not been set yet.") else: return self._output_filename @@ -78,7 +92,21 @@ def output_filename(self, filename: str) -> None: self.validate_output_filename(filename) self._output_filename = filename - def set_default_output_filename(self) -> None: - self.output_filename = ( - f"{os.path.splitext(self.input_filename)[0]}{SAFE_EXTENSION}" - ) + @property + def default_output_filename(self) -> str: + return f"{os.path.splitext(self.input_filename)[0]}{SAFE_EXTENSION}" + + def is_unconverted(self) -> bool: + return self.state is Document.STATE_UNCONVERTED + + def is_failed(self) -> bool: + return self.state is Document.STATE_FAILED + + def is_safe(self) -> bool: + return self.state is Document.STATE_SAFE + + def mark_as_failed(self) -> None: + self.state = Document.STATE_FAILED + + def mark_as_safe(self) -> None: + self.state = Document.STATE_SAFE diff --git a/dangerzone/gui/main_window.py b/dangerzone/gui/main_window.py index 2e2e2d82b..16689e72d 100644 --- a/dangerzone/gui/main_window.py +++ b/dangerzone/gui/main_window.py @@ -433,9 +433,6 @@ def document_selected(self) -> None: self.dangerous_doc_label.setText( f"Suspicious: {os.path.basename(self.document.input_filename)}" ) - - # Set the default save location - self.document.set_default_output_filename() self.save_lineedit.setText(os.path.basename(self.document.output_filename)) def save_browse_button_clicked(self) -> None: diff --git a/dangerzone/logic.py b/dangerzone/logic.py index ec53fae0c..d2a044d8d 100644 --- a/dangerzone/logic.py +++ b/dangerzone/logic.py @@ -6,12 +6,13 @@ import shutil import subprocess import sys -from typing import Optional +from typing import Callable, List, Optional import appdirs import colorama from .container import convert +from .document import Document from .settings import Settings from .util import get_resource_path @@ -36,3 +37,34 @@ def __init__(self) -> None: # Load settings self.settings = Settings(self) + + self.documents: List[Document] = [] + + def add_document( + self, input_filename: str, output_filename: Optional[str] = None + ) -> None: + doc = Document(input_filename, output_filename) + self.documents.append(doc) + + def convert_documents( + self, ocr_lang: Optional[str], stdout_callback: Callable[[str], None] + ) -> None: + all_successful = True + + for document in self.documents: + success = convert( + document.input_filename, + document.output_filename, + ocr_lang, + stdout_callback, + ) + if success: + document.mark_as_safe() + else: + document.mark_as_failed() + + def get_safe_documents(self) -> List[Document]: + return [doc for doc in self.documents if doc.is_safe()] + + def get_failed_documents(self) -> List[Document]: + return [doc for doc in self.documents if doc.is_failed()] diff --git a/tests/test_document.py b/tests/test_document.py index 666f465fc..fe4d1dc7a 100644 --- a/tests/test_document.py +++ b/tests/test_document.py @@ -78,3 +78,24 @@ def test_output_file_not_pdf(tmp_path: Path) -> None: assert "Safe PDF filename must end in '.pdf'" in str(e.value) assert not os.path.exists(docx_file) + + +def test_is_unconverted_by_default(sample_doc: None) -> None: + d = Document(sample_doc) + assert d.is_unconverted() + + +def test_mark_as_safe(sample_doc: str) -> None: + d = Document(sample_doc) + d.mark_as_safe() + assert d.is_safe() + assert not d.is_failed() + assert not d.is_unconverted() + + +def test_mark_as_failed(sample_doc: str) -> None: + d = Document(sample_doc) + d.mark_as_failed() + assert d.is_failed() + assert not d.is_safe() + assert not d.is_unconverted() From f9b564be037fc2c7875c7b180ee6f002f8a9db53 Mon Sep 17 00:00:00 2001 From: deeplow Date: Tue, 1 Nov 2022 12:55:02 +0000 Subject: [PATCH 02/10] Security: cli wildcard injection mitigation Wildcard arguments like `*` can lead to security vulnerabilities if files are maliciously named as would-be parameters. In the following scenario if a file in the current directory was named '--help', running the following command would show the help. $ dangerzone-cli * By checking if parameters also happen to be files, we mitigate this risk and have a chance to warn the user. --- dangerzone/args.py | 42 ++++++++++++++++++++++++++++++++++++++++++ dangerzone/cli.py | 11 +++++++---- dangerzone/errors.py | 2 +- tests/test_cli.py | 39 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 87 insertions(+), 7 deletions(-) diff --git a/dangerzone/args.py b/dangerzone/args.py index 120da65bf..8deb01db8 100644 --- a/dangerzone/args.py +++ b/dangerzone/args.py @@ -1,3 +1,5 @@ +import functools +import os from typing import List, Optional, Tuple import click @@ -64,3 +66,43 @@ def validate_output_filename( ctx: click.Context, param: str, value: Optional[str] ) -> Optional[str]: return _validate_output_filename(ctx, param, value) + + +def check_suspicious_options(args: List[str]) -> None: + options = set([arg for arg in args if arg.startswith("-")]) + try: + files = set(os.listdir()) + except Exception: + # If we can list files in the current working directory, this means that + # we're probably in an unlinked directory. Dangerzone should still work in + # this case, so we should return here. + return + + intersection = options & files + if intersection: + filenames_str = ", ".join(intersection) + msg = ( + f"Security: Detected CLI options that are also present as files in the" + f" current working directory: {filenames_str}" + ) + click.echo(msg) + exit(1) + + +def override_parser_and_check_suspicious_options(click_main: click.Command) -> None: + """Override the argument parsing logic of Click. + + Click does not allow us to have access to the raw arguments that it receives (either + from sys.argv or from its testing module). To circumvent this, we can override its + `Command.parse_args()` method, which is public and should be safe to do so. + + We can use it to check for any suspicious options prior to arg parsing. + """ + orig_parse_fn = click_main.parse_args + + @functools.wraps(orig_parse_fn) + def custom_parse_fn(ctx: click.Context, args: List[str]) -> List[str]: + check_suspicious_options(args) + return orig_parse_fn(ctx, args) + + click_main.parse_args = custom_parse_fn # type: ignore [assignment] diff --git a/dangerzone/cli.py b/dangerzone/cli.py index a06283aca..e06d891ef 100644 --- a/dangerzone/cli.py +++ b/dangerzone/cli.py @@ -1,18 +1,18 @@ import json import logging -import os import sys -from typing import List, Optional +from typing import Any, Callable, List, Optional, TypeVar import click from colorama import Back, Fore, Style from . import args, container, errors -from .container import convert -from .document import SAFE_EXTENSION, Document +from .document import SAFE_EXTENSION from .logic import DangerzoneCore from .util import get_version +F = TypeVar("F", bound=Callable[..., Any]) + def print_header(s: str) -> None: click.echo("") @@ -98,6 +98,9 @@ def stdout_callback(line: str) -> None: exit(0) +args.override_parser_and_check_suspicious_options(cli_main) + + def setup_logging() -> None: if getattr(sys, "dangerzone_dev", True): fmt = "%(message)s" diff --git a/dangerzone/errors.py b/dangerzone/errors.py index 98b27592d..a26d213ff 100644 --- a/dangerzone/errors.py +++ b/dangerzone/errors.py @@ -1,7 +1,7 @@ import functools import logging import sys -from typing import Any, Callable, TypeVar, cast +from typing import Any, Callable, Sequence, TypeVar, cast import click diff --git a/tests/test_cli.py b/tests/test_cli.py index 257388bd7..c31fc23c2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextlib import copy import os import shutil @@ -106,7 +107,9 @@ def __str__(self) -> str: class TestCli(TestBase): - def run_cli(self, args: Sequence[str] | str = ()) -> CLIResult: + def run_cli( + self, args: Sequence[str] | str = (), tmp_path: Path = None + ) -> CLIResult: """Run the CLI with the provided arguments. Callers can either provide a list of arguments (iterable), or a single @@ -120,7 +123,20 @@ def run_cli(self, args: Sequence[str] | str = ()) -> CLIResult: # Convert the single argument to a tuple, else Click will attempt # to tokenize it. args = (args,) - result = CliRunner().invoke(cli_main, args) + + # TODO: Replace this with `contextlib.chdir()` [1], which was added in + # Python 3.11. + # + # [1]: # https://docs.python.org/3/library/contextlib.html#contextlib.chdir + try: + if tmp_path is not None: + cwd = os.getcwd() + os.chdir(tmp_path) + result = CliRunner().invoke(cli_main, args) + finally: + if tmp_path is not None: + os.chdir(cwd) + return CLIResult.reclass_click_result(result, args) @@ -197,3 +213,22 @@ def test_filenames(self, filename: str, tmp_path: Path) -> None: result = self.run_cli(doc_path) result.assert_success() assert len(os.listdir(tmp_path)) == 2 + + +class TestSecurity(TestCli): + def test_suspicious_double_dash_file(self, tmp_path: Path) -> None: + """Protection against "dangeronze-cli *" and files named --option.""" + file_path = tmp_path / "--ocr-lang" + file_path.touch() + result = self.run_cli(["--ocr-lang", "eng"], tmp_path) + result.assert_failure(message="Security: Detected CLI options that are also") + + def test_suspicious_double_dash_and_equals_file(self, tmp_path: Path) -> None: + """Protection against "dangeronze-cli *" and files named --option=value.""" + file_path = tmp_path / "--output-filename=bad" + file_path.touch() + result = self.run_cli(["--output-filename=bad", "eng"], tmp_path) + result.assert_failure(message="Security: Detected CLI options that are also") + + # TODO: Check that this applies for single dash arguments, and concatenated + # single dash arguments, once Dangerzone supports them. From e17912888a9414c958d1e3f4c230178e7a3ad37b Mon Sep 17 00:00:00 2001 From: deeplow Date: Tue, 20 Sep 2022 18:12:49 +0100 Subject: [PATCH 03/10] Add test cases for bulk document conversion --- tests/test_cli.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index c31fc23c2..4db575d11 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -214,6 +214,29 @@ def test_filenames(self, filename: str, tmp_path: Path) -> None: result.assert_success() assert len(os.listdir(tmp_path)) == 2 + def test_bulk(self, tmp_path: Path) -> None: + filenames = ["1.pdf", "2.pdf", "3.pdf"] + file_paths = [] + for filename in filenames: + doc_path = str(tmp_path / filename) + shutil.copyfile(self.sample_doc, doc_path) + file_paths.append(doc_path) + + result = self.run_cli(file_paths) + result.assert_success() + assert len(os.listdir(tmp_path)) == 2 * len(filenames) + + def test_bulk_fail_on_output_filename(self, tmp_path: Path) -> None: + filenames = ["1.pdf", "2.pdf", "3.pdf"] + file_paths = [] + for filename in filenames: + doc_path = str(tmp_path / filename) + shutil.copyfile(self.sample_doc, doc_path) + file_paths.append(doc_path) + + result = self.run_cli(['--output-filename="output.pdf"'] + file_paths) + result.assert_failure() + class TestSecurity(TestCli): def test_suspicious_double_dash_file(self, tmp_path: Path) -> None: From 2d587f408285974756b0448f5b88f00597fba5c0 Mon Sep 17 00:00:00 2001 From: deeplow Date: Thu, 22 Sep 2022 15:10:51 +0100 Subject: [PATCH 04/10] Parallel cli bulk conversions via threading Initial parallel document conversion: creates a pool of N threads defined by the setting 'parallel_conversions'. Each thread calls convert() on a document. --- dangerzone/container.py | 21 +++++++++++++++++++++ dangerzone/logic.py | 13 ++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/dangerzone/container.py b/dangerzone/container.py index 8d40314dd..2558a6565 100644 --- a/dangerzone/container.py +++ b/dangerzone/container.py @@ -254,6 +254,27 @@ def convert( return success +def get_max_parallel_conversions() -> int: + n_cpu = 1 + if platform.system() == "Linux": + # if on linux containers run natively + cpu_count = os.cpu_count() + if cpu_count is not None: + n_cpu = cpu_count + + elif get_runtime_name() == "docker": + # For Windows and MacOS containers run in VM + # So we obtain the CPU count for the VM + n_cpu_str = subprocess.check_output( + [get_runtime(), "info", "--format", "{{.NCPU}}"], + text=True, + startupinfo=get_subprocess_startupinfo(), + ) + n_cpu = int(n_cpu_str.strip()) + + return 2 * n_cpu + 1 + + # From global_common: # def validate_convert_to_pixel_output(self, common, output): diff --git a/dangerzone/logic.py b/dangerzone/logic.py index d2a044d8d..1c82b7c82 100644 --- a/dangerzone/logic.py +++ b/dangerzone/logic.py @@ -1,3 +1,4 @@ +import concurrent.futures import gzip import json import logging @@ -11,7 +12,7 @@ import appdirs import colorama -from .container import convert +from . import container from .document import Document from .settings import Settings from .util import get_resource_path @@ -49,10 +50,8 @@ def add_document( def convert_documents( self, ocr_lang: Optional[str], stdout_callback: Callable[[str], None] ) -> None: - all_successful = True - - for document in self.documents: - success = convert( + def convert_doc(document: Document) -> None: + success = container.convert( document.input_filename, document.output_filename, ocr_lang, @@ -63,6 +62,10 @@ def convert_documents( else: document.mark_as_failed() + max_jobs = container.get_max_parallel_conversions() + with concurrent.futures.ThreadPoolExecutor(max_workers=max_jobs) as executor: + executor.map(convert_doc, self.documents) + def get_safe_documents(self) -> List[Document]: return [doc for doc in self.documents if doc.is_safe()] From 6d2fdf0afe993b7963e464c4fd7c7bd5e1d80d41 Mon Sep 17 00:00:00 2001 From: deeplow Date: Mon, 26 Sep 2022 10:39:59 +0100 Subject: [PATCH 05/10] Deduplicate container output parsing (stdout_callback) The container output logging logic was in both the CLI and the GUI. This change moves the core parsing logic to container.py. Since the code was largely the same, now cli does need to specify a stdout_callback since all the necessary logging already happens. The GUI now only adds an stdout_callback to detect if there was an error during the conversion process. --- dangerzone/cli.py | 15 +-------- dangerzone/container.py | 62 ++++++++++++++++++++++++++--------- dangerzone/gui/main_window.py | 25 +++----------- dangerzone/logic.py | 5 ++- 4 files changed, 54 insertions(+), 53 deletions(-) diff --git a/dangerzone/cli.py b/dangerzone/cli.py index e06d891ef..d0f6580ab 100644 --- a/dangerzone/cli.py +++ b/dangerzone/cli.py @@ -1,4 +1,3 @@ -import json import logging import sys from typing import Any, Callable, List, Optional, TypeVar @@ -69,19 +68,7 @@ def cli_main( # Convert the document print_header("Converting document to safe PDF") - def stdout_callback(line: str) -> None: - try: - status = json.loads(line) - s = Style.BRIGHT + Fore.CYAN + f"{status['percentage']}% " - if status["error"]: - s += Style.RESET_ALL + Fore.RED + status["text"] - else: - s += Style.RESET_ALL + status["text"] - click.echo(s) - except: - click.echo(f"Invalid JSON returned from container: {line}") - - dangerzone.convert_documents(ocr_lang, stdout_callback) + dangerzone.convert_documents(ocr_lang) documents_safe = dangerzone.get_safe_documents() documents_failed = dangerzone.get_failed_documents() diff --git a/dangerzone/container.py b/dangerzone/container.py index 2558a6565..f05dc7fc2 100644 --- a/dangerzone/container.py +++ b/dangerzone/container.py @@ -1,4 +1,5 @@ import gzip +import json import logging import os import pipes @@ -6,10 +7,12 @@ import shutil import subprocess import tempfile -from typing import Callable, List, Optional +from typing import Callable, List, Optional, Tuple import appdirs +from colorama import Fore, Style +from .document import Document from .util import get_resource_path, get_subprocess_startupinfo container_name = "dangerzone.rocks/dangerzone" @@ -127,7 +130,34 @@ def is_container_installed() -> bool: return installed -def exec(args: List[str], stdout_callback: Callable[[str], None] = None) -> int: +def parse_progress(document: Document, line: str) -> Tuple[bool, str, int]: + """ + Parses a line returned by the container. + """ + try: + status = json.loads(line) + except: + error_message = f"Invalid JSON returned from container:\n\n\t {line}" + log.error(error_message) + return (True, error_message, -1) + + s = Style.BRIGHT + Fore.YELLOW + f"[doc {document.id}] " + s += Fore.CYAN + f"{status['percentage']}% " + if status["error"]: + s += Style.RESET_ALL + Fore.RED + status["text"] + log.error(s) + else: + s += Style.RESET_ALL + status["text"] + log.info(s) + + return (status["error"], status["text"], status["percentage"]) + + +def exec( + document: Document, + args: List[str], + stdout_callback: Optional[Callable] = None, +) -> int: args_str = " ".join(pipes.quote(s) for s in args) log.info("> " + args_str) @@ -140,18 +170,21 @@ def exec(args: List[str], stdout_callback: Callable[[str], None] = None) -> int: universal_newlines=True, startupinfo=startupinfo, ) as p: - if stdout_callback and p.stdout is not None: + if p.stdout is not None: for line in p.stdout: - stdout_callback(line) + (error, text, percentage) = parse_progress(document, line) + if stdout_callback: + stdout_callback(error, text, percentage) p.communicate() return p.returncode def exec_container( + document: Document, command: List[str], extra_args: List[str] = [], - stdout_callback: Callable[[str], None] = None, + stdout_callback: Optional[Callable] = None, ) -> int: container_runtime = get_runtime() @@ -181,14 +214,13 @@ def exec_container( ) args = [container_runtime] + args - return exec(args, stdout_callback) + return exec(document, args, stdout_callback) def convert( - input_filename: str, - output_filename: str, + document: Document, ocr_lang: Optional[str], - stdout_callback: Callable[[str], None], + stdout_callback: Optional[Callable] = None, ) -> bool: success = False @@ -210,11 +242,11 @@ def convert( command = ["/usr/bin/python3", "/usr/local/bin/dangerzone.py", "document-to-pixels"] extra_args = [ "-v", - f"{input_filename}:/tmp/input_file", + f"{document.input_filename}:/tmp/input_file", "-v", f"{pixel_dir}:/dangerzone", ] - ret = exec_container(command, extra_args, stdout_callback) + ret = exec_container(document, command, extra_args, stdout_callback) if ret != 0: log.error("documents-to-pixels failed") else: @@ -232,18 +264,18 @@ def convert( "-e", f"OCR_LANGUAGE={ocr_lang}", ] - ret = exec_container(command, extra_args, stdout_callback) + ret = exec_container(document, command, extra_args, stdout_callback) if ret != 0: log.error("pixels-to-pdf failed") else: # Move the final file to the right place - if os.path.exists(output_filename): - os.remove(output_filename) + if os.path.exists(document.output_filename): + os.remove(document.output_filename) container_output_filename = os.path.join( safe_dir, "safe-output-compressed.pdf" ) - shutil.move(container_output_filename, output_filename) + shutil.move(container_output_filename, document.output_filename) # We did it success = True diff --git a/dangerzone/gui/main_window.py b/dangerzone/gui/main_window.py index 16689e72d..99fc54d6c 100644 --- a/dangerzone/gui/main_window.py +++ b/dangerzone/gui/main_window.py @@ -493,34 +493,17 @@ def run(self) -> None: ocr_lang = None if convert( - self.document.input_filename, - self.document.output_filename, + self.document, ocr_lang, self.stdout_callback, ): self.finished.emit(self.error) - def stdout_callback(self, line: str) -> None: - try: - status = json.loads(line) - except: - log.error(f"Invalid JSON returned from container: {line}") - self.error = True - self.update.emit( - True, f"Invalid JSON returned from container:\n\n{line}", 0 - ) - return - - s = Style.BRIGHT + Fore.CYAN + f"{status['percentage']}% " - if status["error"]: + def stdout_callback(self, error: bool, text: str, percentage: int) -> None: + if error: self.error = True - s += Style.RESET_ALL + Fore.RED + status["text"] - log.error(s) - else: - s += Style.RESET_ALL + status["text"] - log.info(s) - self.update.emit(status["error"], status["text"], status["percentage"]) + self.update.emit(error, text, percentage) class ConvertWidget(QtWidgets.QWidget): diff --git a/dangerzone/logic.py b/dangerzone/logic.py index 1c82b7c82..21fbb69ce 100644 --- a/dangerzone/logic.py +++ b/dangerzone/logic.py @@ -48,12 +48,11 @@ def add_document( self.documents.append(doc) def convert_documents( - self, ocr_lang: Optional[str], stdout_callback: Callable[[str], None] + self, ocr_lang: Optional[str], stdout_callback: Optional[Callable] = None ) -> None: def convert_doc(document: Document) -> None: success = container.convert( - document.input_filename, - document.output_filename, + document, ocr_lang, stdout_callback, ) From 65ac0d19c31fef871a3210429ae6c96a3e70e174 Mon Sep 17 00:00:00 2001 From: deeplow Date: Mon, 26 Sep 2022 13:56:13 +0100 Subject: [PATCH 06/10] Add an identifier to each document With multiple input documents it is possible only one of them has issues. Mentioning the document id can help debug. --- dangerzone/document.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dangerzone/document.py b/dangerzone/document.py index 9fb0b12df..8d044a535 100644 --- a/dangerzone/document.py +++ b/dangerzone/document.py @@ -1,6 +1,8 @@ import enum +import logging import os import platform +import secrets import stat import tempfile from typing import Optional @@ -11,6 +13,8 @@ SAFE_EXTENSION = "-safe.pdf" +log = logging.getLogger(__name__) + class Document: """Track the state of a single document. @@ -25,11 +29,15 @@ class Document: STATE_FAILED = enum.auto() def __init__(self, input_filename: str = None, output_filename: str = None) -> None: + # NOTE: See https://github.com/freedomofpress/dangerzone/pull/216#discussion_r1015449418 + self.id = secrets.token_urlsafe(6)[0:6] + self._input_filename: Optional[str] = None self._output_filename: Optional[str] = None if input_filename: self.input_filename = input_filename + self.announce_id() if output_filename: self.output_filename = output_filename @@ -75,6 +83,7 @@ def input_filename(self, filename: str) -> None: filename = self.normalize_filename(filename) self.validate_input_filename(filename) self._input_filename = filename + self.announce_id() @property def output_filename(self) -> str: @@ -96,6 +105,9 @@ def output_filename(self, filename: str) -> None: def default_output_filename(self) -> str: return f"{os.path.splitext(self.input_filename)[0]}{SAFE_EXTENSION}" + def announce_id(self) -> None: + log.info(f"Assigning ID '{self.id}' to doc '{self.input_filename}'") + def is_unconverted(self) -> bool: return self.state is Document.STATE_UNCONVERTED From d71e230173b82ca5789c81a4bf9b9a5273038637 Mon Sep 17 00:00:00 2001 From: deeplow Date: Mon, 10 Oct 2022 15:43:11 +0100 Subject: [PATCH 07/10] Update document state exclusively in convert() The document's state update is better update in the convert() function. This is because this function is always called for the conversion progress regardless of the frontend. --- dangerzone/container.py | 5 +++++ dangerzone/document.py | 10 ++++++++++ dangerzone/logic.py | 4 ---- tests/test_document.py | 6 ++++++ 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/dangerzone/container.py b/dangerzone/container.py index f05dc7fc2..31a2413d1 100644 --- a/dangerzone/container.py +++ b/dangerzone/container.py @@ -173,6 +173,10 @@ def exec( if p.stdout is not None: for line in p.stdout: (error, text, percentage) = parse_progress(document, line) + if error: + document.mark_as_failed() + if percentage == 100.0: + document.mark_as_safe() if stdout_callback: stdout_callback(error, text, percentage) @@ -223,6 +227,7 @@ def convert( stdout_callback: Optional[Callable] = None, ) -> bool: success = False + document.mark_as_converting() if ocr_lang: ocr = "1" diff --git a/dangerzone/document.py b/dangerzone/document.py index 8d044a535..f825b7818 100644 --- a/dangerzone/document.py +++ b/dangerzone/document.py @@ -25,6 +25,7 @@ class Document: # document conversion state STATE_UNCONVERTED = enum.auto() + STATE_CONVERTING = enum.auto() STATE_SAFE = enum.auto() STATE_FAILED = enum.auto() @@ -111,14 +112,23 @@ def announce_id(self) -> None: def is_unconverted(self) -> bool: return self.state is Document.STATE_UNCONVERTED + def is_converting(self) -> bool: + return self.state is Document.STATE_CONVERTING + def is_failed(self) -> bool: return self.state is Document.STATE_FAILED def is_safe(self) -> bool: return self.state is Document.STATE_SAFE + def mark_as_converting(self) -> None: + log.debug(f"Marking doc {self.id} as 'converting'") + self.state = Document.STATE_CONVERTING + def mark_as_failed(self) -> None: + log.debug(f"Marking doc {self.id} as 'failed'") self.state = Document.STATE_FAILED def mark_as_safe(self) -> None: + log.debug(f"Marking doc {self.id} as 'safe'") self.state = Document.STATE_SAFE diff --git a/dangerzone/logic.py b/dangerzone/logic.py index 21fbb69ce..b92870b48 100644 --- a/dangerzone/logic.py +++ b/dangerzone/logic.py @@ -56,10 +56,6 @@ def convert_doc(document: Document) -> None: ocr_lang, stdout_callback, ) - if success: - document.mark_as_safe() - else: - document.mark_as_failed() max_jobs = container.get_max_parallel_conversions() with concurrent.futures.ThreadPoolExecutor(max_workers=max_jobs) as executor: diff --git a/tests/test_document.py b/tests/test_document.py index fe4d1dc7a..61621ad1f 100644 --- a/tests/test_document.py +++ b/tests/test_document.py @@ -93,6 +93,12 @@ def test_mark_as_safe(sample_doc: str) -> None: assert not d.is_unconverted() +def test_mark_as_converting(sample_doc: str) -> None: + d = Document(sample_doc) + d.mark_as_converting() + assert d.is_converting() + + def test_mark_as_failed(sample_doc: str) -> None: d = Document(sample_doc) d.mark_as_failed() From 1bdbb1959c3aa9e3cecd2a72d0393061bd67d5f9 Mon Sep 17 00:00:00 2001 From: deeplow Date: Tue, 1 Nov 2022 13:54:21 +0000 Subject: [PATCH 08/10] Changelog: add cli multi-doc support --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cab27652..6c3ead82a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - Feature: Re-add Fedora 37 support - Feature: Add Debian Bookworm (12) support - Reinstate Ubuntu Focal support ([issue #206](https://github.com/freedomofpress/dangerzone/issues/206)) +- Feature: support multiple input documents in the CLI-version ## Dangerzone 0.3.2 - Bug fix: some non-ascii characters like “ would prevent Dangerzone from working ([issue #144](https://github.com/freedomofpress/dangerzone/issues/144)) From 93f17b3166609d0cece3da5b0bdade08686330e8 Mon Sep 17 00:00:00 2001 From: deeplow Date: Wed, 2 Nov 2022 11:12:05 +0000 Subject: [PATCH 09/10] Specialize DocumentFilenameException() for disambiguation All filename-related exceptions were of class DocumentFilenameException. This made it difficult to disambiguate them. Specializing them makes it it easier for tests to detect which exception in particular we want to verify. --- dangerzone/document.py | 18 +++++++----------- dangerzone/errors.py | 43 ++++++++++++++++++++++++++++++++++++++++++ tests/test_document.py | 19 +++++++------------ 3 files changed, 57 insertions(+), 23 deletions(-) diff --git a/dangerzone/document.py b/dangerzone/document.py index f825b7818..b49b51281 100644 --- a/dangerzone/document.py +++ b/dangerzone/document.py @@ -9,7 +9,7 @@ import appdirs -from .errors import DocumentFilenameException +from . import errors SAFE_EXTENSION = "-safe.pdf" @@ -54,28 +54,24 @@ def validate_input_filename(filename: str) -> None: try: open(filename, "rb") except FileNotFoundError as e: - raise DocumentFilenameException( - "Input file not found: make sure you typed it correctly." - ) from e + raise errors.InputFileNotFoundException() from e except PermissionError as e: - raise DocumentFilenameException( - "You don't have permission to open the input file." - ) from e + raise errors.InputFileNotReadableException() from e @staticmethod def validate_output_filename(filename: str) -> None: if not filename.endswith(".pdf"): - raise DocumentFilenameException("Safe PDF filename must end in '.pdf'") + raise errors.NonPDFOutputFileException() try: with open(filename, "wb"): pass except PermissionError as e: - raise DocumentFilenameException("Safe PDF filename is not writable") from e + raise errors.UnwriteableOutputFileException() from e @property def input_filename(self) -> str: if self._input_filename is None: - raise DocumentFilenameException("Input filename has not been set yet.") + raise errors.NotSetInputFilenameException() else: return self._input_filename @@ -92,7 +88,7 @@ def output_filename(self) -> str: if self._input_filename is not None: return self.default_output_filename else: - raise DocumentFilenameException("Output filename has not been set yet.") + raise errors.NotSetOutputFilenameException() else: return self._output_filename diff --git a/dangerzone/errors.py b/dangerzone/errors.py index a26d213ff..00eb7026a 100644 --- a/dangerzone/errors.py +++ b/dangerzone/errors.py @@ -14,6 +14,49 @@ class DocumentFilenameException(Exception): """Exception for document-related filename errors.""" +class InputFileNotFoundException(DocumentFilenameException): + """Exception for when an input file does not exist.""" + + def __init__(self) -> None: + super().__init__("Input file not found: make sure you typed it correctly.") + + +class InputFileNotReadableException(DocumentFilenameException): + """Exception for when an input file exists but is not readable.""" + + def __init__(self) -> None: + super().__init__("You don't have permission to open the input file.") + + +class NonPDFOutputFileException(DocumentFilenameException): + """Exception for when the output file is not a PDF.""" + + def __init__(self) -> None: + super().__init__("Safe PDF filename must end in '.pdf'") + + +class UnwriteableOutputFileException(DocumentFilenameException): + """Exception for when the output file is not writeable.""" + + def __init__(self) -> None: + super().__init__("Safe PDF filename is not writable") + + +class NotSetInputFilenameException(DocumentFilenameException): + """Exception for when the output filename is set before having an + associated input file.""" + + def __init__(self) -> None: + super().__init__("Input filename has not been set yet.") + + +class NotSetOutputFilenameException(DocumentFilenameException): + """Exception for when the output filename is read before it has been set.""" + + def __init__(self) -> None: + super().__init__("Output filename has not been set yet.") + + def handle_document_errors(func: F) -> F: """Log document-related errors and exit gracefully.""" diff --git a/tests/test_document.py b/tests/test_document.py index 61621ad1f..9e9d23b1f 100644 --- a/tests/test_document.py +++ b/tests/test_document.py @@ -5,8 +5,8 @@ import pytest +from dangerzone import errors from dangerzone.document import Document -from dangerzone.errors import DocumentFilenameException from . import sample_doc, unreadable_pdf, unwriteable_pdf @@ -25,15 +25,13 @@ def test_input_file_none() -> None: Attempts to read a document's filename when no doc has been set """ d = Document() - with pytest.raises(DocumentFilenameException) as e: + with pytest.raises(errors.NotSetInputFilenameException) as e: d.input_filename - assert "Input filename has not been set yet" in str(e.value) def test_input_file_non_existing() -> None: - with pytest.raises(DocumentFilenameException) as e: + with pytest.raises(errors.InputFileNotFoundException) as e: Document("non-existing-file.pdf") - assert "Input file not found" in str(e.value) # XXX: This is not easy to test on Windows, as the file owner can always read it. @@ -41,14 +39,13 @@ def test_input_file_non_existing() -> None: # https://stackoverflow.com/questions/72528318/what-file-permissions-make-a-file-unreadable-by-owner-in-windows @pytest.mark.skipif(platform.system() == "Windows", reason="Unix-specific") def test_input_file_unreadable(unreadable_pdf: str) -> None: - with pytest.raises(DocumentFilenameException) as e: + with pytest.raises(errors.InputFileNotReadableException) as e: Document(unreadable_pdf) - assert "don't have permission to open the input file" in str(e.value) def test_output_file_unwriteable(unwriteable_pdf: str) -> None: d = Document() - with pytest.raises(DocumentFilenameException) as e: + with pytest.raises(errors.UnwriteableOutputFileException) as e: d.output_filename = unwriteable_pdf assert "Safe PDF filename is not writable" in str(e.value) @@ -64,18 +61,16 @@ def test_output_file_none() -> None: Attempts to read a document's filename when no doc has been set """ d = Document() - with pytest.raises(DocumentFilenameException) as e: + with pytest.raises(errors.NotSetOutputFilenameException) as e: d.output_filename - assert "Output filename has not been set yet" in str(e.value) def test_output_file_not_pdf(tmp_path: Path) -> None: docx_file = str(tmp_path.joinpath("document.docx")) d = Document() - with pytest.raises(DocumentFilenameException) as e: + with pytest.raises(errors.NonPDFOutputFileException) as e: d.output_filename = docx_file - assert "Safe PDF filename must end in '.pdf'" in str(e.value) assert not os.path.exists(docx_file) From 0b738ba49021d2e2e1dcba3e8bd55cc7cae2b669 Mon Sep 17 00:00:00 2001 From: deeplow Date: Wed, 2 Nov 2022 16:50:39 +0000 Subject: [PATCH 10/10] Do not create outfile files when checking if writeable Checking if files were writeable created files in the process. In the case where someone adds a list of N files to dangerzone but exits before converting, they would be left with N 0-byte files for the -safe version. Now they don't. Fixes #214 --- CHANGELOG.md | 1 + dangerzone/document.py | 9 ++++----- dangerzone/errors.py | 2 +- tests/__init__.py | 7 ------- tests/test_document.py | 14 ++++++++------ 5 files changed, 14 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c3ead82a..bd4a24132 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - Feature: Add Debian Bookworm (12) support - Reinstate Ubuntu Focal support ([issue #206](https://github.com/freedomofpress/dangerzone/issues/206)) - Feature: support multiple input documents in the CLI-version +- Bug fix: Failed execution no longer produces an empty "safe" documents ([issue #214](https://github.com/freedomofpress/dangerzone/issues/214)) ## Dangerzone 0.3.2 - Bug fix: some non-ascii characters like “ would prevent Dangerzone from working ([issue #144](https://github.com/freedomofpress/dangerzone/issues/144)) diff --git a/dangerzone/document.py b/dangerzone/document.py index b49b51281..d1e311c7f 100644 --- a/dangerzone/document.py +++ b/dangerzone/document.py @@ -5,6 +5,7 @@ import secrets import stat import tempfile +from pathlib import Path from typing import Optional import appdirs @@ -62,11 +63,9 @@ def validate_input_filename(filename: str) -> None: def validate_output_filename(filename: str) -> None: if not filename.endswith(".pdf"): raise errors.NonPDFOutputFileException() - try: - with open(filename, "wb"): - pass - except PermissionError as e: - raise errors.UnwriteableOutputFileException() from e + if not os.access(Path(filename).parent, os.W_OK): + # in unwriteable directory + raise errors.UnwriteableOutputDirException() @property def input_filename(self) -> str: diff --git a/dangerzone/errors.py b/dangerzone/errors.py index 00eb7026a..1e6811a34 100644 --- a/dangerzone/errors.py +++ b/dangerzone/errors.py @@ -35,7 +35,7 @@ def __init__(self) -> None: super().__init__("Safe PDF filename must end in '.pdf'") -class UnwriteableOutputFileException(DocumentFilenameException): +class UnwriteableOutputDirException(DocumentFilenameException): """Exception for when the output file is not writeable.""" def __init__(self) -> None: diff --git a/tests/__init__.py b/tests/__init__.py index dd32eacd9..2733a0f4b 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -30,13 +30,6 @@ def sample_doc() -> str: return str(test_docs_dir.joinpath(BASIC_SAMPLE)) -@pytest.fixture -def unwriteable_pdf(tmp_path: Path) -> str: - file_path = tmp_path / "document.pdf" - file_path.touch(mode=0o400) - return str(file_path) - - @pytest.fixture def unreadable_pdf(tmp_path: Path) -> str: file_path = tmp_path / "document.pdf" diff --git a/tests/test_document.py b/tests/test_document.py index 9e9d23b1f..3b0bdbae9 100644 --- a/tests/test_document.py +++ b/tests/test_document.py @@ -8,7 +8,7 @@ from dangerzone import errors from dangerzone.document import Document -from . import sample_doc, unreadable_pdf, unwriteable_pdf +from . import sample_doc, unreadable_pdf def test_input_sample_init(sample_doc: str) -> None: @@ -43,11 +43,13 @@ def test_input_file_unreadable(unreadable_pdf: str) -> None: Document(unreadable_pdf) -def test_output_file_unwriteable(unwriteable_pdf: str) -> None: - d = Document() - with pytest.raises(errors.UnwriteableOutputFileException) as e: - d.output_filename = unwriteable_pdf - assert "Safe PDF filename is not writable" in str(e.value) +@pytest.mark.skipif(platform.system() == "Windows", reason="Unix-specific") +def test_output_file_unwriteable_dir(sample_doc: str, tmp_path: Path) -> None: + # make parent dir unwriteable + sample_doc_safe = str(tmp_path / "document-safe.pdf") + os.chmod(tmp_path, 0o400) + with pytest.raises(errors.UnwriteableOutputDirException) as e: + d = Document(sample_doc, sample_doc_safe) def test_output(tmp_path: Path) -> None: