Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Basic via-CLI multi-document support #216

Merged
merged 10 commits into from
Nov 14, 2022
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
- Feature: Re-add Fedora 37 support
- Feature: Add Debian Bookworm (12) support
- Reinstate Ubuntu Focal support ([issue #206](https://github.com/freedomofpress/dangerzone/issues/206))
- Feature: support multiple input documents in the CLI-version
- Bug fix: Failed execution no longer produces an empty "safe" document ([issue #214](https://github.com/freedomofpress/dangerzone/issues/214))

## Dangerzone 0.3.2
- Bug fix: some non-ascii characters like “ would prevent Dangerzone from working ([issue #144](https://github.com/freedomofpress/dangerzone/issues/144))
Expand Down
62 changes: 61 additions & 1 deletion dangerzone/args.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from typing import Optional
import functools
import os
from typing import List, Optional, Tuple

import click

Expand All @@ -17,6 +19,18 @@ def _validate_input_filename(
return filename


@errors.handle_document_errors
def _validate_input_filenames(
    ctx: click.Context, param: List[str], value: Tuple[str, ...]
) -> List[str]:
    """Normalize and validate every input filename passed on the command line.

    Used as the implementation of the Click callback for the variadic
    ``filenames`` argument, so ``value`` is a tuple of arbitrary length
    (hence ``Tuple[str, ...]`` rather than the one-element ``Tuple[str]``).

    Args:
        ctx: The active Click context (unused here).
        param: The Click parameter being validated (unused here).
            NOTE(review): Click passes a ``click.Parameter`` object; the
            annotation is kept as-is to match the sibling callbacks.
        value: The raw filenames as received from Click.

    Returns:
        The filenames after ``Document.normalize_filename``, in input order.

    Any error raised by ``Document.validate_input_filename`` is left to the
    ``errors.handle_document_errors`` decorator (its handling is defined in
    the ``errors`` module, not here).
    """
    normalized_filenames = []
    for filename in value:
        # Validate the normalized form, since that is what gets returned.
        filename = Document.normalize_filename(filename)
        Document.validate_input_filename(filename)
        normalized_filenames.append(filename)
    return normalized_filenames


@errors.handle_document_errors
def _validate_output_filename(
ctx: click.Context, param: str, value: Optional[str]
Expand All @@ -42,7 +56,53 @@ def validate_input_filename(
return _validate_input_filename(ctx, param, value)


def validate_input_filenames(
    ctx: click.Context, param: List[str], value: Tuple[str]
) -> List[str]:
    """Click callback for a group of input filenames.

    Forwards all three arguments to ``_validate_input_filenames`` and hands
    back whatever it returns.
    """
    validated = _validate_input_filenames(ctx, param, value)
    return validated


def validate_output_filename(
    ctx: click.Context, param: str, value: Optional[str]
) -> Optional[str]:
    """Click callback for the optional output filename.

    Forwards all three arguments to ``_validate_output_filename`` and hands
    back whatever it returns.
    """
    validated = _validate_output_filename(ctx, param, value)
    return validated


def check_suspicious_options(args: List[str]) -> None:
    """Abort if a CLI option is also the name of a file in the working directory.

    Every argument starting with ``-`` is compared against the entries of the
    current working directory. If any of them match, a message is printed and
    the process exits with status 1.
    NOTE(review): presumably this guards against files whose names look like
    options ending up on the command line (e.g. through shell wildcard
    expansion) — confirm the intended threat model.

    Args:
        args: The raw, unparsed command-line arguments.

    Raises:
        SystemExit: With code 1 when at least one option matches a filename.
    """
    options = {arg for arg in args if arg.startswith("-")}
    try:
        files = set(os.listdir())
    except OSError:
        # If we cannot list files in the current working directory, this means
        # that we're probably in an unlinked directory. Dangerzone should still
        # work in this case, so we should return here.
        return

    intersection = options & files
    if not intersection:
        return

    # Sets are unordered; sort so the reported list is deterministic.
    filenames_str = ", ".join(sorted(intersection))
    msg = (
        f"Security: Detected CLI options that are also present as files in the"
        f" current working directory: {filenames_str}"
    )
    click.echo(msg)
    # Raise SystemExit directly instead of calling the `exit` builtin: that
    # helper is injected by the `site` module for interactive use and is not
    # guaranteed to exist (e.g. with `python -S` or in frozen applications).
    raise SystemExit(1)


def override_parser_and_check_suspicious_options(click_main: click.Command) -> None:
    """Run the suspicious-option check in front of Click's argument parsing.

    Click gives us no access to the raw arguments it receives (whether they
    come from sys.argv or from its testing module). As a workaround, the public
    `Command.parse_args()` method of the given command is replaced with a
    wrapper that first inspects the raw arguments and then defers to the
    original parser.
    """
    click_parse_args = click_main.parse_args

    @functools.wraps(click_parse_args)
    def parse_args_with_check(ctx: click.Context, args: List[str]) -> List[str]:
        # The check must see the arguments before Click interprets them.
        check_suspicious_options(args)
        remaining = click_parse_args(ctx, args)
        return remaining

    click_main.parse_args = parse_args_with_check  # type: ignore [assignment]
71 changes: 35 additions & 36 deletions dangerzone/cli.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
import json
import logging
import os
import sys
from typing import Optional
from typing import Any, Callable, List, Optional, TypeVar

import click
from colorama import Back, Fore, Style

from . import args, container, errors
from .container import convert
from .document import SAFE_EXTENSION, Document
from .document import SAFE_EXTENSION
from .logic import DangerzoneCore
from .util import get_version

F = TypeVar("F", bound=Callable[..., Any])


def print_header(s: str) -> None:
click.echo("")
Expand All @@ -26,23 +25,29 @@ def print_header(s: str) -> None:
help=f"Default is filename ending with {SAFE_EXTENSION}",
)
@click.option("--ocr-lang", help="Language to OCR, defaults to none")
@click.argument("filename", required=True, callback=args.validate_input_filename)
@click.argument(
"filenames",
required=True,
nargs=-1,
type=click.UNPROCESSED,
callback=args.validate_input_filenames,
)
@errors.handle_document_errors
def cli_main(
output_filename: Optional[str], ocr_lang: Optional[str], filename: str
output_filename: Optional[str], ocr_lang: Optional[str], filenames: List[str]
) -> None:
setup_logging()
dangerzone = DangerzoneCore()

display_banner()

document = Document(filename)

# Set PDF output filename
if output_filename:
document.output_filename = output_filename
if len(filenames) == 1 and output_filename:
dangerzone.add_document(filenames[0], output_filename)
elif len(filenames) > 1 and output_filename:
click.echo("--output-filename can only be used with one input file.")
exit(1)
else:
document.set_default_output_filename()
for filename in filenames:
dangerzone.add_document(filename)

# Validate OCR language
if ocr_lang:
Expand All @@ -63,30 +68,24 @@ def cli_main(
# Convert the document
print_header("Converting document to safe PDF")

def stdout_callback(line: str) -> None:
try:
status = json.loads(line)
s = Style.BRIGHT + Fore.CYAN + f"{status['percentage']}% "
if status["error"]:
s += Style.RESET_ALL + Fore.RED + status["text"]
else:
s += Style.RESET_ALL + status["text"]
click.echo(s)
except:
click.echo(f"Invalid JSON returned from container: {line}")
dangerzone.convert_documents(ocr_lang)
documents_safe = dangerzone.get_safe_documents()
documents_failed = dangerzone.get_failed_documents()

if convert(
document.input_filename,
document.output_filename,
ocr_lang,
stdout_callback,
):
print_header("Safe PDF created successfully")
click.echo(document.output_filename)
exit(0)
else:
print_header("Failed to convert document")
if documents_safe != []:
print_header("Safe PDF(s) created successfully")
for document in documents_safe:
click.echo(document.output_filename)
if documents_failed != []:
print_header("Failed to convert document(s)")
for document in documents_failed:
click.echo(document.input_filename)
exit(1)
else:
exit(0)


args.override_parser_and_check_suspicious_options(cli_main)


def setup_logging() -> None:
Expand Down
88 changes: 73 additions & 15 deletions dangerzone/container.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import gzip
import json
import logging
import os
import pipes
import platform
import shutil
import subprocess
import tempfile
from typing import Callable, List, Optional
from typing import Callable, List, Optional, Tuple

import appdirs
from colorama import Fore, Style

from .document import Document
from .util import get_resource_path, get_subprocess_startupinfo

container_name = "dangerzone.rocks/dangerzone"
Expand Down Expand Up @@ -127,7 +130,34 @@ def is_container_installed() -> bool:
return installed


def exec(args: List[str], stdout_callback: Callable[[str], None] = None) -> int:
def parse_progress(document: Document, line: str) -> Tuple[bool, str, int]:
    """Parse one line of progress output returned by the container.

    The line is expected to be a JSON object with the keys ``error``,
    ``text`` and ``percentage``. A well-formed line is logged (prefixed with
    the document id, at error or info level depending on ``error``).

    Args:
        document: The document being converted; only ``document.id`` is read.
        line: A single line of container output.

    Returns:
        ``(error, text, percentage)`` as reported by the container. For a line
        that is not valid JSON, or lacks the expected keys, the result is
        ``(True, <error message>, -1)`` and the message is logged as an error.
    """
    try:
        status = json.loads(line)
        error = status["error"]
        text = status["text"]
        percentage = status["percentage"]
    except (ValueError, KeyError, TypeError):
        # ValueError covers json.JSONDecodeError. KeyError/TypeError cover
        # output that is valid JSON but not an object with the expected keys.
        # Container output must never be able to crash the caller, and a bare
        # `except:` would also swallow KeyboardInterrupt/SystemExit.
        error_message = f"Invalid JSON returned from container:\n\n\t {line}"
        log.error(error_message)
        return (True, error_message, -1)

    s = Style.BRIGHT + Fore.YELLOW + f"[doc {document.id}] "
    s += Fore.CYAN + f"{percentage}% "
    if error:
        s += Style.RESET_ALL + Fore.RED + text
        log.error(s)
    else:
        s += Style.RESET_ALL + text
        log.info(s)

    return (error, text, percentage)


def exec(
document: Document,
args: List[str],
stdout_callback: Optional[Callable] = None,
) -> int:
args_str = " ".join(pipes.quote(s) for s in args)
log.info("> " + args_str)

Expand All @@ -140,18 +170,25 @@ def exec(args: List[str], stdout_callback: Callable[[str], None] = None) -> int:
universal_newlines=True,
startupinfo=startupinfo,
) as p:
if stdout_callback and p.stdout is not None:
if p.stdout is not None:
for line in p.stdout:
stdout_callback(line)
(error, text, percentage) = parse_progress(document, line)
if error:
document.mark_as_failed()
if percentage == 100.0:
document.mark_as_safe()
if stdout_callback:
stdout_callback(error, text, percentage)

p.communicate()
return p.returncode


def exec_container(
document: Document,
command: List[str],
extra_args: List[str] = [],
stdout_callback: Callable[[str], None] = None,
stdout_callback: Optional[Callable] = None,
) -> int:
container_runtime = get_runtime()

Expand Down Expand Up @@ -181,16 +218,16 @@ def exec_container(
)

args = [container_runtime] + args
return exec(args, stdout_callback)
return exec(document, args, stdout_callback)


def convert(
input_filename: str,
output_filename: str,
document: Document,
ocr_lang: Optional[str],
stdout_callback: Callable[[str], None],
stdout_callback: Optional[Callable] = None,
) -> bool:
success = False
document.mark_as_converting()

if ocr_lang:
ocr = "1"
Expand All @@ -210,11 +247,11 @@ def convert(
command = ["/usr/bin/python3", "/usr/local/bin/dangerzone.py", "document-to-pixels"]
extra_args = [
"-v",
f"{input_filename}:/tmp/input_file",
f"{document.input_filename}:/tmp/input_file",
"-v",
f"{pixel_dir}:/dangerzone",
]
ret = exec_container(command, extra_args, stdout_callback)
ret = exec_container(document, command, extra_args, stdout_callback)
if ret != 0:
log.error("documents-to-pixels failed")
else:
Expand All @@ -232,18 +269,18 @@ def convert(
"-e",
f"OCR_LANGUAGE={ocr_lang}",
]
ret = exec_container(command, extra_args, stdout_callback)
ret = exec_container(document, command, extra_args, stdout_callback)
if ret != 0:
log.error("pixels-to-pdf failed")
else:
# Move the final file to the right place
if os.path.exists(output_filename):
os.remove(output_filename)
if os.path.exists(document.output_filename):
os.remove(document.output_filename)

container_output_filename = os.path.join(
safe_dir, "safe-output-compressed.pdf"
)
shutil.move(container_output_filename, output_filename)
shutil.move(container_output_filename, document.output_filename)

# We did it
success = True
Expand All @@ -254,6 +291,27 @@ def convert(
return success


def get_max_parallel_conversions() -> int:
    """Return the upper bound on simultaneous conversions: ``2 * n_cpu + 1``.

    ``n_cpu`` is the host CPU count on Linux, the CPU count reported by
    ``<runtime> info`` when the runtime name is "docker", and 1 otherwise
    (including when the host CPU count cannot be determined).
    """
    if platform.system() == "Linux":
        # On Linux the containers run natively, so the host CPUs are what counts.
        host_cpus = os.cpu_count()
        cores = 1 if host_cpus is None else host_cpus
    elif get_runtime_name() == "docker":
        # On Windows and macOS the containers run inside a VM, so ask the
        # runtime how many CPUs that VM has.
        info_cmd = [get_runtime(), "info", "--format", "{{.NCPU}}"]
        reported = subprocess.check_output(
            info_cmd,
            text=True,
            startupinfo=get_subprocess_startupinfo(),
        )
        cores = int(reported.strip())
    else:
        cores = 1

    return 2 * cores + 1


# From global_common:

# def validate_convert_to_pixel_output(self, common, output):
Expand Down
Loading