From b3d4be211509c641d1ba393cb2fc888c41b5277a Mon Sep 17 00:00:00 2001
From: Robin Windey
Date: Thu, 15 Oct 2020 18:46:37 +0200
Subject: [PATCH 1/8] First working version with OCRmyPDF #32
---
composer.json | 5 +-
composer.lock | 560 ++++++++----------
lib/AppInfo/Application.php | 18 +-
lib/OcrProcessors/PdfOcrProcessor.php | 197 +-----
lib/Wrapper/CommandWrapper.php | 79 +++
lib/Wrapper/FpdiWrapper.php | 78 ---
lib/Wrapper/ICommand.php | 74 +++
lib/Wrapper/IFpdi.php | 36 --
lib/Wrapper/IImagick.php | 43 --
lib/Wrapper/IPdfParser.php | 41 --
lib/Wrapper/ITesseractOcr.php | 37 --
lib/Wrapper/IWrapperFactory.php | 29 -
lib/Wrapper/ImagickWrapper.php | 115 ----
lib/Wrapper/PdfParserWrapper.php | 46 --
lib/Wrapper/TesseractOcrWrapper.php | 69 ---
lib/Wrapper/WrapperFactory.php | 34 --
tests/Unit/AppInfo/ApplicationTest.php | 6 +-
.../OcrProcessors/PdfOcrProcessorTest.php | 199 ++-----
18 files changed, 476 insertions(+), 1190 deletions(-)
create mode 100644 lib/Wrapper/CommandWrapper.php
delete mode 100644 lib/Wrapper/FpdiWrapper.php
create mode 100644 lib/Wrapper/ICommand.php
delete mode 100644 lib/Wrapper/IFpdi.php
delete mode 100644 lib/Wrapper/IImagick.php
delete mode 100644 lib/Wrapper/IPdfParser.php
delete mode 100644 lib/Wrapper/ITesseractOcr.php
delete mode 100644 lib/Wrapper/IWrapperFactory.php
delete mode 100644 lib/Wrapper/ImagickWrapper.php
delete mode 100644 lib/Wrapper/PdfParserWrapper.php
delete mode 100644 lib/Wrapper/TesseractOcrWrapper.php
delete mode 100644 lib/Wrapper/WrapperFactory.php
diff --git a/composer.json b/composer.json
index 34d527a..af7c221 100644
--- a/composer.json
+++ b/composer.json
@@ -1,9 +1,6 @@
{
"require": {
- "thiagoalessio/tesseract_ocr": "^2.9",
- "smalot/pdfparser": "^0.15.0",
- "setasign/fpdi": "^2.3",
- "setasign/fpdf": "^1.8"
+ "mikehaertl/php-shellcommand": "^1.6"
},
"require-dev": {
"phpunit/phpunit": "^8.0",
diff --git a/composer.lock b/composer.lock
index 6840ce6..b073a11 100644
--- a/composer.lock
+++ b/composer.lock
@@ -1,250 +1,35 @@
{
"_readme": [
"This file locks the dependencies of your project to a known state",
- "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
+ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
- "content-hash": "51f9fbef2691c0fe41d0ad74f8376124",
+ "content-hash": "e55f55fa4c0a53d009aa03e19fb4655f",
"packages": [
{
- "name": "setasign/fpdf",
- "version": "1.8.2",
+ "name": "mikehaertl/php-shellcommand",
+ "version": "1.6.2",
"source": {
"type": "git",
- "url": "https://github.com/Setasign/FPDF.git",
- "reference": "d77904018090c17dc9f3ab6e944679a7a47e710a"
+ "url": "https://github.com/mikehaertl/php-shellcommand.git",
+ "reference": "06d6220c77c4632b639f4855f76026c59bceb8aa"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/Setasign/FPDF/zipball/d77904018090c17dc9f3ab6e944679a7a47e710a",
- "reference": "d77904018090c17dc9f3ab6e944679a7a47e710a",
- "shasum": ""
- },
- "type": "library",
- "autoload": {
- "classmap": [
- "fpdf.php"
- ]
- },
- "notification-url": "https://packagist.org/downloads/",
- "license": [
- "MIT"
- ],
- "authors": [
- {
- "name": "Olivier Plathey",
- "email": "oliver@fpdf.org",
- "homepage": "http://fpdf.org/"
- }
- ],
- "description": "FPDF is a PHP class which allows to generate PDF files with pure PHP. F from FPDF stands for Free: you may use it for any kind of usage and modify it to suit your needs.",
- "homepage": "http://www.fpdf.org",
- "keywords": [
- "fpdf",
- "pdf"
- ],
- "time": "2019-12-08T10:32:10+00:00"
- },
- {
- "name": "setasign/fpdi",
- "version": "v2.3.4",
- "source": {
- "type": "git",
- "url": "https://github.com/Setasign/FPDI.git",
- "reference": "2b5fb811c04f937ef257ef3f798cebeded33c136"
- },
- "dist": {
- "type": "zip",
- "url": "https://api.github.com/repos/Setasign/FPDI/zipball/2b5fb811c04f937ef257ef3f798cebeded33c136",
- "reference": "2b5fb811c04f937ef257ef3f798cebeded33c136",
- "shasum": ""
- },
- "require": {
- "ext-zlib": "*",
- "php": "^5.6 || ^7.0"
- },
- "conflict": {
- "setasign/tfpdf": "<1.31"
- },
- "require-dev": {
- "phpunit/phpunit": "~5.7",
- "setasign/fpdf": "~1.8",
- "setasign/tfpdf": "1.31",
- "squizlabs/php_codesniffer": "^3.5",
- "tecnickcom/tcpdf": "~6.2"
- },
- "suggest": {
- "setasign/fpdf": "FPDI will extend this class but as it is also possible to use TCPDF or tFPDF as an alternative. There's no fixed dependency configured."
- },
- "type": "library",
- "autoload": {
- "psr-4": {
- "setasign\\Fpdi\\": "src/"
- }
- },
- "notification-url": "https://packagist.org/downloads/",
- "license": [
- "MIT"
- ],
- "authors": [
- {
- "name": "Jan Slabon",
- "email": "jan.slabon@setasign.com",
- "homepage": "https://www.setasign.com"
- },
- {
- "name": "Maximilian Kresse",
- "email": "maximilian.kresse@setasign.com",
- "homepage": "https://www.setasign.com"
- }
- ],
- "description": "FPDI is a collection of PHP classes facilitating developers to read pages from existing PDF documents and use them as templates in FPDF. Because it is also possible to use FPDI with TCPDF, there are no fixed dependencies defined. Please see suggestions for packages which evaluates the dependencies automatically.",
- "homepage": "https://www.setasign.com/fpdi",
- "keywords": [
- "fpdf",
- "fpdi",
- "pdf"
- ],
- "time": "2020-08-27T06:55:47+00:00"
- },
- {
- "name": "smalot/pdfparser",
- "version": "v0.15.1",
- "source": {
- "type": "git",
- "url": "https://github.com/smalot/pdfparser.git",
- "reference": "6bc9dcbab5154f7d9f4c99e9cd3391f7ba019dc1"
- },
- "dist": {
- "type": "zip",
- "url": "https://api.github.com/repos/smalot/pdfparser/zipball/6bc9dcbab5154f7d9f4c99e9cd3391f7ba019dc1",
- "reference": "6bc9dcbab5154f7d9f4c99e9cd3391f7ba019dc1",
- "shasum": ""
- },
- "require": {
- "ext-mbstring": "*",
- "ext-zlib": "*",
- "php": "^5.6|^7.0",
- "tecnickcom/tcpdf": "^6.2.22"
- },
- "require-dev": {
- "atoum/atoum": "^3.1",
- "friendsofphp/php-cs-fixer": "^2.16.3"
- },
- "type": "library",
- "autoload": {
- "psr-0": {
- "Smalot\\PdfParser\\": "src/"
- }
- },
- "notification-url": "https://packagist.org/downloads/",
- "license": [
- "LGPL-3.0"
- ],
- "authors": [
- {
- "name": "Sebastien MALOT",
- "email": "sebastien@malot.fr"
- }
- ],
- "description": "Pdf parser library. Can read and extract information from pdf file.",
- "homepage": "http://www.pdfparser.org",
- "keywords": [
- "extract",
- "parse",
- "parser",
- "pdf",
- "text"
- ],
- "time": "2020-05-27T07:55:41+00:00"
- },
- {
- "name": "tecnickcom/tcpdf",
- "version": "6.3.5",
- "source": {
- "type": "git",
- "url": "https://github.com/tecnickcom/TCPDF.git",
- "reference": "19a535eaa7fb1c1cac499109deeb1a7a201b4549"
- },
- "dist": {
- "type": "zip",
- "url": "https://api.github.com/repos/tecnickcom/TCPDF/zipball/19a535eaa7fb1c1cac499109deeb1a7a201b4549",
- "reference": "19a535eaa7fb1c1cac499109deeb1a7a201b4549",
- "shasum": ""
- },
- "require": {
- "php": ">=5.3.0"
- },
- "type": "library",
- "autoload": {
- "classmap": [
- "config",
- "include",
- "tcpdf.php",
- "tcpdf_parser.php",
- "tcpdf_import.php",
- "tcpdf_barcodes_1d.php",
- "tcpdf_barcodes_2d.php",
- "include/tcpdf_colors.php",
- "include/tcpdf_filters.php",
- "include/tcpdf_font_data.php",
- "include/tcpdf_fonts.php",
- "include/tcpdf_images.php",
- "include/tcpdf_static.php",
- "include/barcodes/datamatrix.php",
- "include/barcodes/pdf417.php",
- "include/barcodes/qrcode.php"
- ]
- },
- "notification-url": "https://packagist.org/downloads/",
- "license": [
- "LGPL-3.0-only"
- ],
- "authors": [
- {
- "name": "Nicola Asuni",
- "email": "info@tecnick.com",
- "role": "lead"
- }
- ],
- "description": "TCPDF is a PHP class for generating PDF documents and barcodes.",
- "homepage": "http://www.tcpdf.org/",
- "keywords": [
- "PDFD32000-2008",
- "TCPDF",
- "barcodes",
- "datamatrix",
- "pdf",
- "pdf417",
- "qrcode"
- ],
- "time": "2020-02-14T14:20:12+00:00"
- },
- {
- "name": "thiagoalessio/tesseract_ocr",
- "version": "2.9.3",
- "source": {
- "type": "git",
- "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
- "reference": "e932f7410e753434b26b214f3f322933efafa0f0"
- },
- "dist": {
- "type": "zip",
- "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/e932f7410e753434b26b214f3f322933efafa0f0",
- "reference": "e932f7410e753434b26b214f3f322933efafa0f0",
+ "url": "https://api.github.com/repos/mikehaertl/php-shellcommand/zipball/06d6220c77c4632b639f4855f76026c59bceb8aa",
+ "reference": "06d6220c77c4632b639f4855f76026c59bceb8aa",
"shasum": ""
},
"require": {
- "php": "^5.4 || ^7.0"
+ "php": ">= 5.4.0"
},
"require-dev": {
- "codacy/coverage": "dev-master",
- "phpunit/php-code-coverage": "^2.2.4"
+ "phpunit/phpunit": ">4.0 <8"
},
"type": "library",
"autoload": {
"psr-4": {
- "thiagoalessio\\TesseractOCR\\": "src/"
+ "mikehaertl\\shellcommand\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
@@ -253,32 +38,30 @@
],
"authors": [
{
- "name": "thiagoalessio",
- "email": "thiagoalessio@me.com"
+ "name": "Michael Härtl",
+ "email": "haertl.mike@gmail.com"
}
],
- "description": "A wrapper to work with Tesseract OCR inside PHP.",
+ "description": "An object oriented interface to shell commands",
"keywords": [
- "OCR",
- "Tesseract",
- "text recognition"
+ "shell"
],
- "time": "2020-01-27T19:53:35+00:00"
+ "time": "2020-08-30T09:56:40+00:00"
}
],
"packages-dev": [
{
"name": "composer/semver",
- "version": "1.5.1",
+ "version": "1.7.1",
"source": {
"type": "git",
"url": "https://github.com/composer/semver.git",
- "reference": "c6bea70230ef4dd483e6bbcab6005f682ed3a8de"
+ "reference": "38276325bd896f90dfcfe30029aa5db40df387a7"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/composer/semver/zipball/c6bea70230ef4dd483e6bbcab6005f682ed3a8de",
- "reference": "c6bea70230ef4dd483e6bbcab6005f682ed3a8de",
+ "url": "https://api.github.com/repos/composer/semver/zipball/38276325bd896f90dfcfe30029aa5db40df387a7",
+ "reference": "38276325bd896f90dfcfe30029aa5db40df387a7",
"shasum": ""
},
"require": {
@@ -326,7 +109,21 @@
"validation",
"versioning"
],
- "time": "2020-01-13T12:06:48+00:00"
+ "funding": [
+ {
+ "url": "https://packagist.com",
+ "type": "custom"
+ },
+ {
+ "url": "https://github.com/composer",
+ "type": "github"
+ },
+ {
+ "url": "https://tidelift.com/funding/github/packagist/composer/composer",
+ "type": "tidelift"
+ }
+ ],
+ "time": "2020-09-27T13:13:07+00:00"
},
{
"name": "composer/xdebug-handler",
@@ -738,20 +535,20 @@
},
{
"name": "paragonie/random_compat",
- "version": "v9.99.99",
+ "version": "v9.99.100",
"source": {
"type": "git",
"url": "https://github.com/paragonie/random_compat.git",
- "reference": "84b4dfb120c6f9b4ff7b3685f9b8f1aa365a0c95"
+ "reference": "996434e5492cb4c3edcb9168db6fbb1359ef965a"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/paragonie/random_compat/zipball/84b4dfb120c6f9b4ff7b3685f9b8f1aa365a0c95",
- "reference": "84b4dfb120c6f9b4ff7b3685f9b8f1aa365a0c95",
+ "url": "https://api.github.com/repos/paragonie/random_compat/zipball/996434e5492cb4c3edcb9168db6fbb1359ef965a",
+ "reference": "996434e5492cb4c3edcb9168db6fbb1359ef965a",
"shasum": ""
},
"require": {
- "php": "^7"
+ "php": ">= 7"
},
"require-dev": {
"phpunit/phpunit": "4.*|5.*",
@@ -779,7 +576,7 @@
"pseudorandom",
"random"
],
- "time": "2018-07-02T15:55:56+00:00"
+ "time": "2020-10-15T08:29:30+00:00"
},
{
"name": "phar-io/manifest",
@@ -885,23 +682,23 @@
},
{
"name": "php-cs-fixer/diff",
- "version": "v1.3.0",
+ "version": "v1.3.1",
"source": {
"type": "git",
"url": "https://github.com/PHP-CS-Fixer/diff.git",
- "reference": "78bb099e9c16361126c86ce82ec4405ebab8e756"
+ "reference": "dbd31aeb251639ac0b9e7e29405c1441907f5759"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/PHP-CS-Fixer/diff/zipball/78bb099e9c16361126c86ce82ec4405ebab8e756",
- "reference": "78bb099e9c16361126c86ce82ec4405ebab8e756",
+ "url": "https://api.github.com/repos/PHP-CS-Fixer/diff/zipball/dbd31aeb251639ac0b9e7e29405c1441907f5759",
+ "reference": "dbd31aeb251639ac0b9e7e29405c1441907f5759",
"shasum": ""
},
"require": {
- "php": "^5.6 || ^7.0"
+ "php": "^5.6 || ^7.0 || ^8.0"
},
"require-dev": {
- "phpunit/phpunit": "^5.7.23 || ^6.4.3",
+ "phpunit/phpunit": "^5.7.23 || ^6.4.3 || ^7.0",
"symfony/process": "^3.3"
},
"type": "library",
@@ -915,14 +712,14 @@
"BSD-3-Clause"
],
"authors": [
- {
- "name": "Kore Nordmann",
- "email": "mail@kore-nordmann.de"
- },
{
"name": "Sebastian Bergmann",
"email": "sebastian@phpunit.de"
},
+ {
+ "name": "Kore Nordmann",
+ "email": "mail@kore-nordmann.de"
+ },
{
"name": "SpacePossum"
}
@@ -932,7 +729,7 @@
"keywords": [
"diff"
],
- "time": "2018-02-15T16:58:55+00:00"
+ "time": "2020-10-14T08:39:05+00:00"
},
{
"name": "phpdocumentor/reflection-common",
@@ -985,16 +782,16 @@
},
{
"name": "phpdocumentor/reflection-docblock",
- "version": "5.2.1",
+ "version": "5.2.2",
"source": {
"type": "git",
"url": "https://github.com/phpDocumentor/ReflectionDocBlock.git",
- "reference": "d870572532cd70bc3fab58f2e23ad423c8404c44"
+ "reference": "069a785b2141f5bcf49f3e353548dc1cce6df556"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/d870572532cd70bc3fab58f2e23ad423c8404c44",
- "reference": "d870572532cd70bc3fab58f2e23ad423c8404c44",
+ "url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/069a785b2141f5bcf49f3e353548dc1cce6df556",
+ "reference": "069a785b2141f5bcf49f3e353548dc1cce6df556",
"shasum": ""
},
"require": {
@@ -1033,20 +830,20 @@
}
],
"description": "With this component, a library can provide support for annotations via DocBlocks or otherwise retrieve information that is embedded in a DocBlock.",
- "time": "2020-08-15T11:14:08+00:00"
+ "time": "2020-09-03T19:13:55+00:00"
},
{
"name": "phpdocumentor/type-resolver",
- "version": "1.3.0",
+ "version": "1.4.0",
"source": {
"type": "git",
"url": "https://github.com/phpDocumentor/TypeResolver.git",
- "reference": "e878a14a65245fbe78f8080eba03b47c3b705651"
+ "reference": "6a467b8989322d92aa1c8bf2bebcc6e5c2ba55c0"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/phpDocumentor/TypeResolver/zipball/e878a14a65245fbe78f8080eba03b47c3b705651",
- "reference": "e878a14a65245fbe78f8080eba03b47c3b705651",
+ "url": "https://api.github.com/repos/phpDocumentor/TypeResolver/zipball/6a467b8989322d92aa1c8bf2bebcc6e5c2ba55c0",
+ "reference": "6a467b8989322d92aa1c8bf2bebcc6e5c2ba55c0",
"shasum": ""
},
"require": {
@@ -1078,32 +875,32 @@
}
],
"description": "A PSR-5 based resolver of Class names, Types and Structural Element Names",
- "time": "2020-06-27T10:12:23+00:00"
+ "time": "2020-09-17T18:55:26+00:00"
},
{
"name": "phpspec/prophecy",
- "version": "1.11.1",
+ "version": "1.12.1",
"source": {
"type": "git",
"url": "https://github.com/phpspec/prophecy.git",
- "reference": "b20034be5efcdab4fb60ca3a29cba2949aead160"
+ "reference": "8ce87516be71aae9b956f81906aaf0338e0d8a2d"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/phpspec/prophecy/zipball/b20034be5efcdab4fb60ca3a29cba2949aead160",
- "reference": "b20034be5efcdab4fb60ca3a29cba2949aead160",
+ "url": "https://api.github.com/repos/phpspec/prophecy/zipball/8ce87516be71aae9b956f81906aaf0338e0d8a2d",
+ "reference": "8ce87516be71aae9b956f81906aaf0338e0d8a2d",
"shasum": ""
},
"require": {
"doctrine/instantiator": "^1.2",
- "php": "^7.2",
- "phpdocumentor/reflection-docblock": "^5.0",
+ "php": "^7.2 || ~8.0, <8.1",
+ "phpdocumentor/reflection-docblock": "^5.2",
"sebastian/comparator": "^3.0 || ^4.0",
"sebastian/recursion-context": "^3.0 || ^4.0"
},
"require-dev": {
"phpspec/phpspec": "^6.0",
- "phpunit/phpunit": "^8.0"
+ "phpunit/phpunit": "^8.0 || ^9.0 <9.3"
},
"type": "library",
"extra": {
@@ -1141,7 +938,7 @@
"spy",
"stub"
],
- "time": "2020-07-08T12:44:21+00:00"
+ "time": "2020-09-29T09:10:42+00:00"
},
{
"name": "phpunit/php-code-coverage",
@@ -1393,6 +1190,7 @@
"keywords": [
"tokenizer"
],
+ "abandoned": true,
"time": "2019-09-17T06:23:10+00:00"
},
{
@@ -2287,16 +2085,16 @@
},
{
"name": "symfony/console",
- "version": "v4.4.11",
+ "version": "v4.4.15",
"source": {
"type": "git",
"url": "https://github.com/symfony/console.git",
- "reference": "55d07021da933dd0d633ffdab6f45d5b230c7e02"
+ "reference": "90933b39c7b312fc3ceaa1ddeac7eb48cb953124"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/symfony/console/zipball/55d07021da933dd0d633ffdab6f45d5b230c7e02",
- "reference": "55d07021da933dd0d633ffdab6f45d5b230c7e02",
+ "url": "https://api.github.com/repos/symfony/console/zipball/90933b39c7b312fc3ceaa1ddeac7eb48cb953124",
+ "reference": "90933b39c7b312fc3ceaa1ddeac7eb48cb953124",
"shasum": ""
},
"require": {
@@ -2360,20 +2158,34 @@
],
"description": "Symfony Console Component",
"homepage": "https://symfony.com",
- "time": "2020-07-06T13:18:39+00:00"
+ "funding": [
+ {
+ "url": "https://symfony.com/sponsor",
+ "type": "custom"
+ },
+ {
+ "url": "https://github.com/fabpot",
+ "type": "github"
+ },
+ {
+ "url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
+ "type": "tidelift"
+ }
+ ],
+ "time": "2020-09-15T07:58:55+00:00"
},
{
"name": "symfony/deprecation-contracts",
- "version": "v2.1.3",
+ "version": "v2.2.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/deprecation-contracts.git",
- "reference": "5e20b83385a77593259c9f8beb2c43cd03b2ac14"
+ "reference": "5fa56b4074d1ae755beb55617ddafe6f5d78f665"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/symfony/deprecation-contracts/zipball/5e20b83385a77593259c9f8beb2c43cd03b2ac14",
- "reference": "5e20b83385a77593259c9f8beb2c43cd03b2ac14",
+ "url": "https://api.github.com/repos/symfony/deprecation-contracts/zipball/5fa56b4074d1ae755beb55617ddafe6f5d78f665",
+ "reference": "5fa56b4074d1ae755beb55617ddafe6f5d78f665",
"shasum": ""
},
"require": {
@@ -2382,7 +2194,7 @@
"type": "library",
"extra": {
"branch-alias": {
- "dev-master": "2.1-dev"
+ "dev-master": "2.2-dev"
},
"thanks": {
"name": "symfony/contracts",
@@ -2410,20 +2222,34 @@
],
"description": "A generic function and convention to trigger deprecation notices",
"homepage": "https://symfony.com",
- "time": "2020-06-06T08:49:21+00:00"
+ "funding": [
+ {
+ "url": "https://symfony.com/sponsor",
+ "type": "custom"
+ },
+ {
+ "url": "https://github.com/fabpot",
+ "type": "github"
+ },
+ {
+ "url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
+ "type": "tidelift"
+ }
+ ],
+ "time": "2020-09-07T11:33:47+00:00"
},
{
"name": "symfony/event-dispatcher",
- "version": "v4.4.11",
+ "version": "v4.4.15",
"source": {
"type": "git",
"url": "https://github.com/symfony/event-dispatcher.git",
- "reference": "6140fc7047dafc5abbe84ba16a34a86c0b0229b8"
+ "reference": "e17bb5e0663dc725f7cdcafc932132735b4725cd"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/symfony/event-dispatcher/zipball/6140fc7047dafc5abbe84ba16a34a86c0b0229b8",
- "reference": "6140fc7047dafc5abbe84ba16a34a86c0b0229b8",
+ "url": "https://api.github.com/repos/symfony/event-dispatcher/zipball/e17bb5e0663dc725f7cdcafc932132735b4725cd",
+ "reference": "e17bb5e0663dc725f7cdcafc932132735b4725cd",
"shasum": ""
},
"require": {
@@ -2441,6 +2267,7 @@
"psr/log": "~1.0",
"symfony/config": "^3.4|^4.0|^5.0",
"symfony/dependency-injection": "^3.4|^4.0|^5.0",
+ "symfony/error-handler": "~3.4|~4.4",
"symfony/expression-language": "^3.4|^4.0|^5.0",
"symfony/http-foundation": "^3.4|^4.0|^5.0",
"symfony/service-contracts": "^1.1|^2",
@@ -2480,7 +2307,21 @@
],
"description": "Symfony EventDispatcher Component",
"homepage": "https://symfony.com",
- "time": "2020-06-18T17:59:13+00:00"
+ "funding": [
+ {
+ "url": "https://symfony.com/sponsor",
+ "type": "custom"
+ },
+ {
+ "url": "https://github.com/fabpot",
+ "type": "github"
+ },
+ {
+ "url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
+ "type": "tidelift"
+ }
+ ],
+ "time": "2020-09-18T14:07:46+00:00"
},
{
"name": "symfony/event-dispatcher-contracts",
@@ -2546,16 +2387,16 @@
},
{
"name": "symfony/filesystem",
- "version": "v5.1.3",
+ "version": "v5.1.7",
"source": {
"type": "git",
"url": "https://github.com/symfony/filesystem.git",
- "reference": "6e4320f06d5f2cce0d96530162491f4465179157"
+ "reference": "1a8697545a8d87b9f2f6b1d32414199cc5e20aae"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/symfony/filesystem/zipball/6e4320f06d5f2cce0d96530162491f4465179157",
- "reference": "6e4320f06d5f2cce0d96530162491f4465179157",
+ "url": "https://api.github.com/repos/symfony/filesystem/zipball/1a8697545a8d87b9f2f6b1d32414199cc5e20aae",
+ "reference": "1a8697545a8d87b9f2f6b1d32414199cc5e20aae",
"shasum": ""
},
"require": {
@@ -2592,20 +2433,34 @@
],
"description": "Symfony Filesystem Component",
"homepage": "https://symfony.com",
- "time": "2020-05-30T20:35:19+00:00"
+ "funding": [
+ {
+ "url": "https://symfony.com/sponsor",
+ "type": "custom"
+ },
+ {
+ "url": "https://github.com/fabpot",
+ "type": "github"
+ },
+ {
+ "url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
+ "type": "tidelift"
+ }
+ ],
+ "time": "2020-09-27T14:02:37+00:00"
},
{
"name": "symfony/finder",
- "version": "v5.1.3",
+ "version": "v5.1.7",
"source": {
"type": "git",
"url": "https://github.com/symfony/finder.git",
- "reference": "4298870062bfc667cb78d2b379be4bf5dec5f187"
+ "reference": "2c3ba7ad6884e6c4451ce2340e2dc23f6fa3e0d8"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/symfony/finder/zipball/4298870062bfc667cb78d2b379be4bf5dec5f187",
- "reference": "4298870062bfc667cb78d2b379be4bf5dec5f187",
+ "url": "https://api.github.com/repos/symfony/finder/zipball/2c3ba7ad6884e6c4451ce2340e2dc23f6fa3e0d8",
+ "reference": "2c3ba7ad6884e6c4451ce2340e2dc23f6fa3e0d8",
"shasum": ""
},
"require": {
@@ -2641,20 +2496,34 @@
],
"description": "Symfony Finder Component",
"homepage": "https://symfony.com",
- "time": "2020-05-20T17:43:50+00:00"
+ "funding": [
+ {
+ "url": "https://symfony.com/sponsor",
+ "type": "custom"
+ },
+ {
+ "url": "https://github.com/fabpot",
+ "type": "github"
+ },
+ {
+ "url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
+ "type": "tidelift"
+ }
+ ],
+ "time": "2020-09-02T16:23:27+00:00"
},
{
"name": "symfony/options-resolver",
- "version": "v5.1.3",
+ "version": "v5.1.7",
"source": {
"type": "git",
"url": "https://github.com/symfony/options-resolver.git",
- "reference": "9ff59517938f88d90b6e65311fef08faa640f681"
+ "reference": "4c7e155bf7d93ea4ba3824d5a14476694a5278dd"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/symfony/options-resolver/zipball/9ff59517938f88d90b6e65311fef08faa640f681",
- "reference": "9ff59517938f88d90b6e65311fef08faa640f681",
+ "url": "https://api.github.com/repos/symfony/options-resolver/zipball/4c7e155bf7d93ea4ba3824d5a14476694a5278dd",
+ "reference": "4c7e155bf7d93ea4ba3824d5a14476694a5278dd",
"shasum": ""
},
"require": {
@@ -2697,7 +2566,21 @@
"configuration",
"options"
],
- "time": "2020-07-12T12:58:00+00:00"
+ "funding": [
+ {
+ "url": "https://symfony.com/sponsor",
+ "type": "custom"
+ },
+ {
+ "url": "https://github.com/fabpot",
+ "type": "github"
+ },
+ {
+ "url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
+ "type": "tidelift"
+ }
+ ],
+ "time": "2020-09-27T03:44:28+00:00"
},
{
"name": "symfony/polyfill-ctype",
@@ -3076,16 +2959,16 @@
},
{
"name": "symfony/process",
- "version": "v5.1.3",
+ "version": "v5.1.7",
"source": {
"type": "git",
"url": "https://github.com/symfony/process.git",
- "reference": "1864216226af21eb76d9477f691e7cbf198e0402"
+ "reference": "d3a2e64866169586502f0cd9cab69135ad12cee9"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/symfony/process/zipball/1864216226af21eb76d9477f691e7cbf198e0402",
- "reference": "1864216226af21eb76d9477f691e7cbf198e0402",
+ "url": "https://api.github.com/repos/symfony/process/zipball/d3a2e64866169586502f0cd9cab69135ad12cee9",
+ "reference": "d3a2e64866169586502f0cd9cab69135ad12cee9",
"shasum": ""
},
"require": {
@@ -3122,20 +3005,34 @@
],
"description": "Symfony Process Component",
"homepage": "https://symfony.com",
- "time": "2020-07-23T08:36:24+00:00"
+ "funding": [
+ {
+ "url": "https://symfony.com/sponsor",
+ "type": "custom"
+ },
+ {
+ "url": "https://github.com/fabpot",
+ "type": "github"
+ },
+ {
+ "url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
+ "type": "tidelift"
+ }
+ ],
+ "time": "2020-09-02T16:23:27+00:00"
},
{
"name": "symfony/service-contracts",
- "version": "v2.1.3",
+ "version": "v2.2.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/service-contracts.git",
- "reference": "58c7475e5457c5492c26cc740cc0ad7464be9442"
+ "reference": "d15da7ba4957ffb8f1747218be9e1a121fd298a1"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/symfony/service-contracts/zipball/58c7475e5457c5492c26cc740cc0ad7464be9442",
- "reference": "58c7475e5457c5492c26cc740cc0ad7464be9442",
+ "url": "https://api.github.com/repos/symfony/service-contracts/zipball/d15da7ba4957ffb8f1747218be9e1a121fd298a1",
+ "reference": "d15da7ba4957ffb8f1747218be9e1a121fd298a1",
"shasum": ""
},
"require": {
@@ -3148,7 +3045,7 @@
"type": "library",
"extra": {
"branch-alias": {
- "dev-master": "2.1-dev"
+ "dev-master": "2.2-dev"
},
"thanks": {
"name": "symfony/contracts",
@@ -3184,11 +3081,25 @@
"interoperability",
"standards"
],
- "time": "2020-07-06T13:23:11+00:00"
+ "funding": [
+ {
+ "url": "https://symfony.com/sponsor",
+ "type": "custom"
+ },
+ {
+ "url": "https://github.com/fabpot",
+ "type": "github"
+ },
+ {
+ "url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
+ "type": "tidelift"
+ }
+ ],
+ "time": "2020-09-07T11:33:47+00:00"
},
{
"name": "symfony/stopwatch",
- "version": "v5.1.3",
+ "version": "v5.1.7",
"source": {
"type": "git",
"url": "https://github.com/symfony/stopwatch.git",
@@ -3234,6 +3145,20 @@
],
"description": "Symfony Stopwatch Component",
"homepage": "https://symfony.com",
+ "funding": [
+ {
+ "url": "https://symfony.com/sponsor",
+ "type": "custom"
+ },
+ {
+ "url": "https://github.com/fabpot",
+ "type": "github"
+ },
+ {
+ "url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
+ "type": "tidelift"
+ }
+ ],
"time": "2020-05-20T17:43:50+00:00"
},
{
@@ -3372,5 +3297,6 @@
"prefer-stable": false,
"prefer-lowest": false,
"platform": [],
- "platform-dev": []
+ "platform-dev": [],
+ "plugin-api-version": "1.1.0"
}
diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php
index 45aa9bd..6b1cb2d 100644
--- a/lib/AppInfo/Application.php
+++ b/lib/AppInfo/Application.php
@@ -32,20 +32,12 @@
use OCA\WorkflowOcr\OcrProcessors\OcrProcessorFactory;
use OCA\WorkflowOcr\Service\IOcrService;
use OCA\WorkflowOcr\Service\OcrService;
+use OCA\WorkflowOcr\Wrapper\CommandWrapper;
use OCA\WorkflowOcr\Wrapper\Filesystem;
-use OCA\WorkflowOcr\Wrapper\FpdiWrapper;
+use OCA\WorkflowOcr\Wrapper\ICommand;
use OCA\WorkflowOcr\Wrapper\IFilesystem;
-use OCA\WorkflowOcr\Wrapper\IFpdi;
-use OCA\WorkflowOcr\Wrapper\IImagick;
-use OCA\WorkflowOcr\Wrapper\ImagickWrapper;
-use OCA\WorkflowOcr\Wrapper\IPdfParser;
-use OCA\WorkflowOcr\Wrapper\ITesseractOcr;
use OCA\WorkflowOcr\Wrapper\IViewFactory;
-use OCA\WorkflowOcr\Wrapper\IWrapperFactory;
-use OCA\WorkflowOcr\Wrapper\PdfParserWrapper;
-use OCA\WorkflowOcr\Wrapper\TesseractOcrWrapper;
use OCA\WorkflowOcr\Wrapper\ViewFactory;
-use OCA\WorkflowOcr\Wrapper\WrapperFactory;
use OCP\AppFramework\App;
use OCP\AppFramework\Bootstrap\IBootContext;
use OCP\AppFramework\Bootstrap\IBootstrap;
@@ -69,13 +61,9 @@ public function __construct(array $urlParams = []) {
public function register(IRegistrationContext $context): void {
$context->registerServiceAlias(IOcrService::class, OcrService::class);
$context->registerServiceAlias(IOcrProcessorFactory::class, OcrProcessorFactory::class);
- $context->registerServiceAlias(IPdfParser::class, PdfParserWrapper::class);
- $context->registerServiceAlias(IImagick::class, ImagickWrapper::class);
- $context->registerServiceAlias(ITesseractOcr::class, TesseractOcrWrapper::class);
$context->registerServiceAlias(IViewFactory::class, ViewFactory::class);
- $context->registerServiceAlias(IFpdi::class, FpdiWrapper::class);
- $context->registerServiceAlias(IWrapperFactory::class, WrapperFactory::class);
$context->registerServiceAlias(IFilesystem::class, Filesystem::class);
+ $context->registerServiceAlias(ICommand::class, CommandWrapper::class);
$context->registerEventListener(RegisterOperationsEvent::class, RegisterFlowOperationsListener::class);
}
diff --git a/lib/OcrProcessors/PdfOcrProcessor.php b/lib/OcrProcessors/PdfOcrProcessor.php
index 8914fca..48a02e5 100644
--- a/lib/OcrProcessors/PdfOcrProcessor.php
+++ b/lib/OcrProcessors/PdfOcrProcessor.php
@@ -24,194 +24,27 @@
namespace OCA\WorkflowOcr\OcrProcessors;
use OCA\WorkflowOcr\Exception\OcrNotPossibleException;
-use OCA\WorkflowOcr\Wrapper\IImagick;
-use OCA\WorkflowOcr\Wrapper\IPdfParser;
-use OCA\WorkflowOcr\Wrapper\ITesseractOcr;
-use OCA\WorkflowOcr\Wrapper\IWrapperFactory;
+use OCA\WorkflowOcr\Wrapper\ICommand;
class PdfOcrProcessor implements IOcrProcessor {
- /** @var IPdfParser */
- private $pdfParser;
- /** @var ITesseractOcr */
- private $tesseract;
- /** @var IWrapperFactory */
- private $wrapperFactory;
+ /** @var ICommand */
+ private $command;
- public function __construct(IPdfParser $pdfParser, ITesseractOcr $tesseract, IWrapperFactory $wrapperFactory) {
- $this->pdfParser = $pdfParser;
- $this->tesseract = $tesseract;
- $this->wrapperFactory = $wrapperFactory;
+ public function __construct(ICommand $command) {
+ $this->command = $command;
}
public function ocrFile(string $fileContent): string {
- $pagesTextInfo = $this->getPagesTextInfo($fileContent);
-
- // Check if at least one page in PDF has no text
- $this->ensureCanOcrPdf($pagesTextInfo);
-
- // Split PDF into single pages
- $splitted = $this->splitPdf($fileContent);
-
- // OCR each single page PDF (if it does not contain text already)
- $this->ocrPages($splitted, $pagesTextInfo);
-
- // Merge results
- return $this->mergePdf($splitted);
- }
-
- /**
- * Returns an associative array (index (int) => containsText (bool)) with information, if the
- * page contains text or not. Index starts a 1.
- */
- private function getPagesTextInfo(string $pdfContent) : array {
- $pdf = $this->pdfParser->parseContent($pdfContent);
-
- $tmpCnt = 1;
- $indices = [];
- $pages = $pdf->getPages();
-
- foreach ($pages as $page) {
- $txt = $page->getText();
- $indices[$tmpCnt++] = !empty($txt) && !empty(trim($txt));
- }
-
- return $indices;
- }
-
- private function ensureCanOcrPdf(array $pagesTextInfo) : void {
- $onePageWithoutText = false;
-
- foreach ($pagesTextInfo as $idx => $containsText) {
- if (!$containsText) {
- $onePageWithoutText = true;
- break;
- }
- }
-
- if (!$onePageWithoutText) {
- throw new OcrNotPossibleException('Pdf only contains pages with text');
- }
- }
-
- /**
- * Splits PDF into associative array with 1-based index.
- */
- private function splitPdf(string $pdfContent) : array {
- try {
- $fpdiWrapper = $this->wrapperFactory->createFpdi($pdfContent);
- $pagecount = $fpdiWrapper->getPageCount();
- $splitted = [];
-
- for ($i = 1; $i <= $pagecount; $i++) {
- $onePageFpdiWrapper = $this->wrapperFactory->createFpdi($pdfContent);
- $pageId = $onePageFpdiWrapper->import($i);
- $s = $onePageFpdiWrapper->getTemplatesize($pageId);
- $onePageFpdiWrapper->AddPage($s['orientation'], $s);
- $onePageFpdiWrapper->useImportedPage($pageId);
-
- try {
- $content = $onePageFpdiWrapper->Output(null, "S");
- $splitted[$i] = $content;
- } finally {
- $onePageFpdiWrapper->Close();
- $onePageFpdiWrapper->closeStreams();
- }
- }
- } finally {
- if (isset($fpdiWrapper)) {
- $fpdiWrapper->Close();
- $fpdiWrapper->closeStreams();
- }
- }
-
- return $splitted;
- }
-
- /**
- * Process each PDF page with ocr algorithm except the pages which already
- * contain a text layer.
- */
- private function ocrPages(array &$splittedPdfPages, array $pagesTextInfo) : void {
- foreach ($splittedPdfPages as $i => $onePagePdf) {
- // Skip pages containing text
- if ($pagesTextInfo[$i] === true) {
- continue;
- }
-
- try {
- // Use Imagick to convert the pdf page to png
- $img = $this->wrapperFactory->createImagick();
- $img->setOption('density', '300');
- $img->readImageBlob($onePagePdf);
- $img->setImageFormat("png");
-
- $ocrPdf = $this->processSinglePageImagick($img);
-
- // Take original page format
- $original = $this->wrapperFactory->createFpdi($onePagePdf);
- $pageId = $original->import(1);
- $originalSize = $original->getTemplatesize($pageId);
-
- // Import single PDF page with ocr layer
- $withOcr = $this->wrapperFactory->createFpdi($ocrPdf);
- $pageIdOcr = $withOcr->import(1);
- $withOcr->AddPage($originalSize['orientation'], $originalSize);
- $withOcr->useImportedPage($pageIdOcr, 0, 0, $originalSize['width'], $originalSize['height'], false);
-
- // Overwrite original page with scanned one
- $splittedPdfPages[$i] = $withOcr->Output(null, "S");
- } finally {
- if (isset($img)) {
- $img->destroy();
- }
- if (isset($original)) {
- $original->Close();
- $original->closeStreams();
- }
- if (isset($withOcr)) {
- $withOcr->Close();
- $withOcr->closeStreams();
- }
- }
- }
- }
-
- private function processSinglePageImagick(IImagick $imagick) : string {
- $data = $imagick->getImageBlob();
- $size = $imagick->getImageLength();
-
- // Use Tesseract for ocr and converting image back to pdf
- $singlePagePdf = $this->tesseract
- ->lang(['deu', 'eng']) // TODO make configurable?
- ->imageData($data, $size)
- ->configFile('pdf')
- ->run();
-
- return $singlePagePdf;
- }
-
- /**
- * Merges single page PDF array into one output PDF.
- */
- private function mergePdf(array &$splitted) : string {
- try {
- $outputPdf = $this->wrapperFactory->createFpdi();
-
- foreach ($splitted as $i => $onePageOcrPdf) {
- $outputPdf->setContent($onePageOcrPdf);
- $pageId = $outputPdf->import(1);
- $s = $outputPdf->getTemplatesize($pageId);
- $outputPdf->AddPage($s['orientation'], $s);
- $outputPdf->useImportedPage($pageId);
- }
-
- $outputPdfContent = $outputPdf->Output(null, "S");
- return $outputPdfContent;
- } finally {
- if (isset($outputPdf)) {
- $outputPdf->Close();
- $outputPdf->closeStreams();
- }
+ // Delegate the whole OCR run to the external OCRmyPDF CLI; the PDF bytes
+ // are piped in through stdin and the result is read back from stdout.
+ // Per the OCRmyPDF CLI documentation: "--skip-text" leaves pages that
+ // already carry a text layer untouched, "-q" suppresses progress output
+ // and "- -" selects stdin as input and stdout as output.
+ // NOTE(review): the trailing "| cat" presumably guarantees that ocrmypdf
+ // sees a pipe on stdout - confirm why it is needed.
+ $this->command
+ ->setCommand("ocrmypdf --skip-text -q - - | cat")
+ ->setStdIn($fileContent);
+
+ if ($this->command->execute()) {
+ // getOutput() is called with its default $trim = true.
+ // NOTE(review): confirm that trimming is harmless for binary PDF output.
+ return $this->command->getOutput();
+ } else {
+ // Hand exit code and error text to the caller via the exception message.
+ $error = $this->command->getError();
+ $exitCode = $this->command->getExitCode();
+ throw new OcrNotPossibleException('OCRmyPDF exited abnormally with exit-code ' . $exitCode . '. Message: ' . $error);
+ }
}
}
diff --git a/lib/Wrapper/CommandWrapper.php b/lib/Wrapper/CommandWrapper.php
new file mode 100644
index 0000000..7eafba7
--- /dev/null
+++ b/lib/Wrapper/CommandWrapper.php
@@ -0,0 +1,79 @@
+
+ *
+ * @license GNU AGPL version 3 or any later version
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+namespace OCA\WorkflowOcr\Wrapper;
+
+use mikehaertl\shellcommand\Command;
+
+/**
+ * Adapter exposing the part of mikehaertl\shellcommand\Command that is
+ * declared by ICommand. Every method forwards to the instance stored in
+ * $this->command; the fluent setters return this wrapper rather than the
+ * wrapped object so that callers only ever depend on ICommand.
+ */
+class CommandWrapper implements ICommand {
+ /** @var Command */
+ private $command;
+
+ public function __construct() {
+ $command = new Command();
+ }
+
+ /**
+ * @inheritdoc
+ */
+ public function setCommand(string $command) : ICommand {
+ $this->command->setCommand($command);
+ return $this;
+ }
+
+ /**
+ * @inheritdoc
+ */
+ public function setStdIn(string $stdIn) : ICommand {
+ $this->command->setStdIn($stdIn);
+ return $this;
+ }
+
+ /**
+ * @inheritdoc
+ */
+ public function execute() : bool {
+ // The cast enforces the bool return type declared by ICommand.
+ return (bool)$this->command->execute();
+ }
+
+ /**
+ * @inheritdoc
+ */
+ public function getOutput(bool $trim = true) : string {
+ // The cast enforces the string return type declared by ICommand.
+ return (string)$this->command->getOutput($trim);
+ }
+
+ /**
+ * @inheritdoc
+ */
+ public function getError(bool $trim = true) : string {
+ // The cast enforces the string return type declared by ICommand.
+ return (string)$this->command->getError($trim);
+ }
+
+ /**
+ * @inheritdoc
+ */
+ public function getExitCode() {
+ // Passed through uncast: ICommand documents the result as int|null.
+ return $this->command->getExitCode();
+ }
+}
diff --git a/lib/Wrapper/FpdiWrapper.php b/lib/Wrapper/FpdiWrapper.php
deleted file mode 100644
index 54f7d83..0000000
--- a/lib/Wrapper/FpdiWrapper.php
+++ /dev/null
@@ -1,78 +0,0 @@
-
- *
- * @license GNU AGPL version 3 or any later version
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- */
-
-namespace OCA\WorkflowOcr\Wrapper;
-
-use setasign\Fpdi\Tcpdf\Fpdi;
-
-class FpdiWrapper extends Fpdi implements IFpdi {
- /** @var resource[] */
- private $streams = [];
- /** @var int */
- private $pageCount;
-
- public function __construct(string $pdfContent = '') {
- parent::__construct();
-
- if ($pdfContent !== '') {
- $this->setContent($pdfContent);
- }
-
- $this->setPrintFooter(false);
- $this->setPrintHeader(false);
- }
-
- public function setContent(string $pdfContent) : void {
- $stream = $this->createStream($pdfContent);
- $this->pageCount = $this->setSourceFile($stream);
- }
-
- public function getPageCount(): int {
- return $this->pageCount;
- }
-
- public function closeStreams() : void {
- foreach ($this->streams as $stream) {
- fclose($stream);
- }
- }
-
- public function import(int $pageNumber) : string {
- return $this->importPage($pageNumber);
- }
-
- private function createStream(string $pdfContent) {
- $stream = fopen('php://temp', 'r+');
-
- if (!$stream) {
- throw new \Exception("Could not open PDF stream");
- }
-
- fwrite($stream, $pdfContent);
- rewind($stream);
-
- $this->streams[] = $stream;
-
- return $stream;
- }
-}
diff --git a/lib/Wrapper/ICommand.php b/lib/Wrapper/ICommand.php
new file mode 100644
index 0000000..927d1ca
--- /dev/null
+++ b/lib/Wrapper/ICommand.php
@@ -0,0 +1,74 @@
+
+ *
+ * @license GNU AGPL version 3 or any later version
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+namespace OCA\WorkflowOcr\Wrapper;
+
+/**
+ * Interface for a shell commandline: configure a command and its standard
+ * input, execute it, then read back stdout, the error message and the exit
+ * code.
+ */
+interface ICommand {
+ /**
+ * Sets the command line to run.
+ *
+ * @param string $command the command or full command string to execute,
+ * like 'gzip' or 'gzip -d'. This interface declares no method for adding
+ * arguments afterwards, so the complete command line has to be passed here.
+ * NOTE(review): whether the string gets shell-escaped is decided by the
+ * implementation - confirm before passing anything derived from user input.
+ * @return ICommand for method chaining
+ */
+ public function setCommand(string $command) : ICommand;
+
+ /**
+ * Sets the data that is fed to the command's standard input.
+ *
+ * @param string $stdIn the string that will be piped to the command via
+ * standard input. This enables the same functionality as piping on the
+ * command line. The parameter is declared as string, so resources such as
+ * file handles or streams cannot be passed through this interface.
+ * @return ICommand for method chaining
+ */
+ public function setStdIn(string $stdIn) : ICommand;
+
+ /**
+ * Execute the command
+ *
+ * @return bool whether execution was successful. If `false`, error details
+ * can be obtained from getError() and getExitCode().
+ */
+ public function execute() : bool;
+
+ /**
+ * @param bool $trim whether to `trim()` the return value. The default is `true`.
+ * @return string the command output (stdout). Empty if none.
+ */
+ public function getOutput(bool $trim = true) : string;
+
+ /**
+ * @param bool $trim whether to `trim()` the return value. The default is `true`.
+ * @return string the error message, either stderr or an internal message.
+ * Empty string if none.
+ */
+ public function getError(bool $trim = true) : string;
+
+ /**
+ * @return int|null the exit code or null if command was not executed yet
+ */
+ public function getExitCode();
+}
diff --git a/lib/Wrapper/IFpdi.php b/lib/Wrapper/IFpdi.php
deleted file mode 100644
index e591a1f..0000000
--- a/lib/Wrapper/IFpdi.php
+++ /dev/null
@@ -1,36 +0,0 @@
-
- *
- * @license GNU AGPL version 3 or any later version
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- */
-
-namespace OCA\WorkflowOcr\Wrapper;
-
-interface IFpdi {
- public function setContent(string $pdfContent) : void;
- public function getPageCount(): int;
- public function closeStreams() : void;
- public function import(int $pageNumber) : string;
- public function getTemplateSize(string $tpl);
- public function AddPage($orientation='', $format='', $keepmargins=false, $tocpage=false);
- public function useImportedPage(string $pageId, $x = 0, $y = 0, $width = null, $height = null, $adjustPageSize = false);
- public function Output($name='doc.pdf', $dest='I');
- public function Close();
-}
diff --git a/lib/Wrapper/IImagick.php b/lib/Wrapper/IImagick.php
deleted file mode 100644
index 7f8c309..0000000
--- a/lib/Wrapper/IImagick.php
+++ /dev/null
@@ -1,43 +0,0 @@
-
- *
- * @author Robin Windey
- *
- * @license GNU AGPL version 3 or any later version
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- *
- */
-
-namespace OCA\WorkflowOcr\Wrapper;
-
-use Iterator;
-
-/**
- * Interface for wrapping Imagick library
- */
-interface IImagick extends Iterator {
- public function setOption(string $key, string $value): void;
- public function readImageBlob(string $fileContent): void;
- public function setImageFormat(string $targetFormat): void;
- public function getImageBlob(): string;
- public function getImageLength(): int;
- public function getNumberImages(): int;
- public function clear(): void;
- public function destroy() : void;
-}
diff --git a/lib/Wrapper/IPdfParser.php b/lib/Wrapper/IPdfParser.php
deleted file mode 100644
index cd1cb32..0000000
--- a/lib/Wrapper/IPdfParser.php
+++ /dev/null
@@ -1,41 +0,0 @@
-
- *
- * @author Robin Windey
- *
- * @license GNU AGPL version 3 or any later version
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- *
- */
-
-namespace OCA\WorkflowOcr\Wrapper;
-
-use \Smalot\PdfParser\Document;
-
-/**
- * Interface for "wrapping" PdfParser
- */
-interface IPdfParser {
- /**
- * @param $content
- * @return Document
- * @throws \Exception
- */
- public function parseContent($pdfContent) : Document;
-}
diff --git a/lib/Wrapper/ITesseractOcr.php b/lib/Wrapper/ITesseractOcr.php
deleted file mode 100644
index 816ed7d..0000000
--- a/lib/Wrapper/ITesseractOcr.php
+++ /dev/null
@@ -1,37 +0,0 @@
-
- *
- * @author Robin Windey
- *
- * @license GNU AGPL version 3 or any later version
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- *
- */
-
-namespace OCA\WorkflowOcr\Wrapper;
-
-/**
- * Interface for wrapping Tesseract OCR library
- */
-interface ITesseractOcr {
- public function configFile(string $config) : ITesseractOcr;
- public function lang(array $langs) : ITesseractOcr;
- public function imageData(string $data, int $size) : ITesseractOcr;
- public function run() : string;
-}
diff --git a/lib/Wrapper/IWrapperFactory.php b/lib/Wrapper/IWrapperFactory.php
deleted file mode 100644
index 384b7a1..0000000
--- a/lib/Wrapper/IWrapperFactory.php
+++ /dev/null
@@ -1,29 +0,0 @@
-
- *
- * @license GNU AGPL version 3 or any later version
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- */
-
-namespace OCA\WorkflowOcr\Wrapper;
-
-interface IWrapperFactory {
- public function createFpdi(string $pdfContent = '') : IFpdi;
- public function createImagick() : IImagick;
-}
diff --git a/lib/Wrapper/ImagickWrapper.php b/lib/Wrapper/ImagickWrapper.php
deleted file mode 100644
index a961981..0000000
--- a/lib/Wrapper/ImagickWrapper.php
+++ /dev/null
@@ -1,115 +0,0 @@
-
- *
- * @author Robin Windey
- *
- * @license GNU AGPL version 3 or any later version
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- *
- */
-
-namespace OCA\WorkflowOcr\Wrapper;
-
-class ImagickWrapper implements IImagick {
- /** @var \Imagick */
- private $wrappedImagick;
-
- public function __construct() {
- $this->wrappedImagick = new \Imagick();
- }
-
- /**
- * @inheritdoc
- */
- public function setOption(string $key, string $value): void {
- $this->wrappedImagick->setOption($key, $value);
- }
-
- /**
- * @inheritdoc
- */
- public function readImageBlob(string $fileContent): void {
- $this->wrappedImagick->readImageBlob($fileContent);
- }
-
- /**
- * @inheritdoc
- */
- public function setImageFormat(string $targetFormat): void {
- $this->wrappedImagick->setImageFormat($targetFormat);
- }
-
- /**
- * @inheritdoc
- */
- public function getImageBlob(): string {
- return $this->wrappedImagick->getImageBlob();
- }
-
- /**
- * @inheritdoc
- */
- public function getImageLength(): int {
- return $this->wrappedImagick->getImageLength();
- }
-
- /**
- * @inheritdoc
- */
- public function getNumberImages(): int {
- return (int)$this->wrappedImagick->getNumberImages();
- }
-
- /**
- * @inheritdoc
- */
- public function clear(): void {
- $this->wrappedImagick->clear();
- }
-
- /**
- * @return mixed
- */
- public function current() {
- return $this->wrappedImagick->current();
- }
-
- /**
- * @return scalar
- */
- public function key() {
- return $this->wrappedImagick->key();
- }
-
- public function next() : void {
- $this->wrappedImagick->next();
- }
-
- public function rewind() : void {
- $this->wrappedImagick->rewind();
- }
-
- public function valid() : bool {
- return $this->wrappedImagick->valid();
- }
-
- public function destroy() : void {
- $this->wrappedImagick->destroy();
- }
-}
diff --git a/lib/Wrapper/PdfParserWrapper.php b/lib/Wrapper/PdfParserWrapper.php
deleted file mode 100644
index 22241d7..0000000
--- a/lib/Wrapper/PdfParserWrapper.php
+++ /dev/null
@@ -1,46 +0,0 @@
-
- *
- * @author Robin Windey
- *
- * @license GNU AGPL version 3 or any later version
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- *
- */
-
-namespace OCA\WorkflowOcr\Wrapper;
-
-use \Smalot\PdfParser\Document;
-use \Smalot\PdfParser\Parser;
-
-class PdfParserWrapper implements IPdfParser {
- /** @var Parser */
- private $wrappedParser;
-
- public function __construct() {
- $this->wrappedParser = new Parser();
- }
-
- /**
- * @inheritdoc
- */
- public function parseContent($pdfContent) : Document {
- return $this->wrappedParser->parseContent($pdfContent);
- }
-}
diff --git a/lib/Wrapper/TesseractOcrWrapper.php b/lib/Wrapper/TesseractOcrWrapper.php
deleted file mode 100644
index 1862ace..0000000
--- a/lib/Wrapper/TesseractOcrWrapper.php
+++ /dev/null
@@ -1,69 +0,0 @@
-
- *
- * @author Robin Windey
- *
- * @license GNU AGPL version 3 or any later version
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- *
- */
-
-namespace OCA\WorkflowOcr\Wrapper;
-
-use thiagoalessio\TesseractOCR\TesseractOCR;
-
-class TesseractOcrWrapper implements ITesseractOcr {
- /** @var TesseractOCR */
- private $wrappedTesseract;
-
- public function __construct() {
- $this->wrappedTesseract = new TesseractOCR();
- }
-
- /**
- * @inheritdoc
- */
- public function configFile(string $config) : ITesseractOcr {
- $this->wrappedTesseract->configFile($config);
- return $this;
- }
-
- /**
- * @inheritdoc
- */
- public function lang(array $langs) : ITesseractOcr {
- call_user_func_array([$this->wrappedTesseract, 'lang'], array_map('trim', $langs));
- return $this;
- }
-
- /**
- * @inheritdoc
- */
- public function imageData(string $data, int $size) : ITesseractOcr {
- $this->wrappedTesseract->imageData($data, $size);
- return $this;
- }
-
- /**
- * @inheritdoc
- */
- public function run() : string {
- return $this->wrappedTesseract->run();
- }
-}
diff --git a/lib/Wrapper/WrapperFactory.php b/lib/Wrapper/WrapperFactory.php
deleted file mode 100644
index 324d758..0000000
--- a/lib/Wrapper/WrapperFactory.php
+++ /dev/null
@@ -1,34 +0,0 @@
-
- *
- * @license GNU AGPL version 3 or any later version
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- */
-
-namespace OCA\WorkflowOcr\Wrapper;
-
-class WrapperFactory implements IWrapperFactory {
- public function createFpdi(string $pdfContent = ''): IFpdi {
- return new FpdiWrapper($pdfContent);
- }
-
- public function createImagick(): IImagick {
- return new ImagickWrapper();
- }
-}
diff --git a/tests/Unit/AppInfo/ApplicationTest.php b/tests/Unit/AppInfo/ApplicationTest.php
index 22c2acb..f5fe94d 100644
--- a/tests/Unit/AppInfo/ApplicationTest.php
+++ b/tests/Unit/AppInfo/ApplicationTest.php
@@ -47,9 +47,9 @@ public function testAutoloadExecutedOnBoot() {
$app->boot($bootContext);
- // PdfParser is one of the dependencies included by autoload.php
- $phpParserExists = class_exists('Smalot\PdfParser\Parser');
- $this->assertTrue($phpParserExists);
+ // 'Command' is one of the dependencies included by autoload.php
+ $commandClassExists = class_exists('mikehaertl\shellcommand\Command');
+ $this->assertTrue($commandClassExists);
}
/**
diff --git a/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php b/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php
index b14dbc3..5ee9a51 100644
--- a/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php
+++ b/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php
@@ -23,161 +23,78 @@
namespace OCA\WorkflowOcr\Tests\Unit\OcrProcessors;
-use PHPUnit\Framework\TestCase;
use OCA\WorkflowOcr\Exception\OcrNotPossibleException;
use OCA\WorkflowOcr\OcrProcessors\PdfOcrProcessor;
-use OCA\WorkflowOcr\Wrapper\IFpdi;
-use OCA\WorkflowOcr\Wrapper\IImagick;
-use OCA\WorkflowOcr\Wrapper\IPdfParser;
-use OCA\WorkflowOcr\Wrapper\ITesseractOcr;
-use OCA\WorkflowOcr\Wrapper\IWrapperFactory;
+use OCA\WorkflowOcr\Wrapper\ICommand;
use PHPUnit\Framework\MockObject\MockObject;
-use \Smalot\PdfParser\Document;
-use Smalot\PdfParser\Page;
+use PHPUnit\Framework\TestCase;
class PdfOcrProcessorTest extends TestCase {
- /** @var MockObject|IPdfParser */
- private $pdfParser;
- /** @var MockObject|ITesseractOcr */
- private $tesseract;
- /** @var MockObject|IWrapperFactory */
- private $wrapperFactory;
- /** @var MockObject|IFpdi */
- private $fpdi;
- /** @var MockObject|IImagick */
- private $imagick;
+ /** @var ICommand|MockObject */
+ private $command;
protected function setUp(): void {
parent::setUp();
- $this->pdfParser = $this->createMock(IPdfParser::class);
- $this->tesseract = $this->createMock(ITesseractOcr::class);
- $this->wrapperFactory = $this->createMock(IWrapperFactory::class);
- $this->fpdi = $this->createMock(IFpdi::class);
- $this->imagick = $this->createMock(IImagick::class);
- $this->wrapperFactory->method('createFpdi')
- ->withAnyParameters()
- ->willReturn($this->fpdi);
- $this->wrapperFactory->method('createImagick')
- ->with()
- ->willReturn($this->imagick);
- }
-
- public function testThrowsOcrNotPossibleException_IfPdfContainsPagesWithTextOnly() {
- /*
- Setup fake PDF document with 2 pages containing text-layer
- */
- $fakePdfDocument = $this->setUpFakePdfDocument('Page1Text', 'Page2Text');
- $this->pdfParser->expects($this->once())
- ->method('parseContent')
- ->with('someBinaryPdfContent')
- ->willReturn($fakePdfDocument);
-
- $this->expectException(OcrNotPossibleException::class);
- $pdfProcessor = new PdfOcrProcessor($this->pdfParser, $this->tesseract, $this->wrapperFactory);
- $pdfProcessor->ocrFile('someBinaryPdfContent');
- }
-
- public function testSplitPdfIsDone() {
- /*
- Setup fake PDF document with 3 pages containing no text-layers
- */
- $fakePdfDocument = $this->setUpFakePdfDocument('', '', '');
- $this->pdfParser->expects($this->once())
- ->method('parseContent')
- ->with('someBinaryPdfContent')
- ->willReturn($fakePdfDocument);
- $this->fpdi->expects($this->once())
- ->method('getPageCount')
- ->with()
- ->willReturn(3);
- $this->fpdi->method('getTemplatesize')
- ->willReturn([
- 'orientation' => 'someOrientation',
- 'width' => 50,
- 'height' => 50
- ]);
- $this->fpdi->method('Output')
- ->with(null, "S")
- ->willReturn('someBinaryPdfContentOfOnePage');
- $this->fpdi->expects($this->atLeast(3))
- ->method('import')
- ->with($this->logicalOr($this->equalTo(1), $this->equalTo(2), $this->equalTo(3)));
-
- $pdfProcessor = new PdfOcrProcessor($this->pdfParser, $this->tesseract, $this->wrapperFactory);
- $pdfProcessor->ocrFile('someBinaryPdfContent');
+ $this->command = $this->createMock(ICommand::class);
}
- public function testOcrIsCalledForEachPageWithoutText() {
- /*
- Setup fake PDF document with 3 pages. 2 without text-layer and one with text-layer.
- */
- $fakePdfDocument = $this->setUpFakePdfDocument('', 'thisPageContainsText', '');
- $this->pdfParser->expects($this->once())
- ->method('parseContent')
- ->with('someBinaryPdfContent')
- ->willReturn($fakePdfDocument);
- $this->fpdi->expects($this->once())
- ->method('getPageCount')
- ->with()
- ->willReturn(3);
- $this->fpdi->method('getTemplatesize')
- ->willReturn([
- 'orientation' => 'someOrientation',
- 'width' => 50,
- 'height' => 50
- ]);
- $this->fpdi->method('Output')
- ->with(null, "S")
- ->willReturn('someBinaryPdfContentOfOnePage');
+ /**
+ * Success path: when the command reports a successful run, ocrFile() has to
+ * hand the input to setStdIn() and return whatever getOutput() yields.
+ */
+ public function testCallsCommandInterface() {
+ $pdfBefore = 'someFileContent';
+ $pdfAfter = 'someOcrFileContent';
+
+ // The fluent setters return the mock itself so the chained calls work.
+ $this->command->expects($this->once())
+ ->method('setCommand')
+ ->willReturn($this->command);
+ $this->command->expects($this->once())
+ ->method('setStdIn')
+ ->with($pdfBefore)
+ ->willReturn($this->command);
+ // Simulate a successful run that produced $pdfAfter on stdout.
+ $this->command->expects($this->once())
+ ->method('execute')
+ ->willReturn(true);
+ $this->command->expects($this->once())
+ ->method('getOutput')
+ ->willReturn($pdfAfter);
- $imageBlob = 'someImageBlob';
- $imageSize = 16;
- $this->imagick->method('getImageBlob')
- ->with()
- ->willReturn($imageBlob);
- $this->imagick->method('getImageLength')
- ->with()
- ->willReturn($imageSize);
-
- // These methods are called for each page which is processed
- $this->tesseract->expects($this->exactly(2))
- ->method('lang')
- ->with(['deu', 'eng'])
- ->willReturn($this->tesseract);
- $this->tesseract->expects($this->exactly(2))
- ->method('imageData')
- ->with($imageBlob, $imageSize)
- ->willReturn($this->tesseract);
- $this->tesseract->expects($this->exactly(2))
- ->method('configFile')
- ->with('pdf')
- ->willReturn($this->tesseract);
- $this->tesseract->expects($this->exactly(2))
- ->method('run')
- ->with();
-
- $pdfProcessor = new PdfOcrProcessor($this->pdfParser, $this->tesseract, $this->wrapperFactory);
- $pdfProcessor->ocrFile('someBinaryPdfContent');
+ $processor = new PdfOcrProcessor($this->command);
+ $result = $processor->ocrFile($pdfBefore);
+
+ // The processor must pass the command output through unchanged.
+ $this->assertEquals($pdfAfter, $result);
 }
- private function setUpFakePdfDocument(...$pageTexts) : MockObject {
- $pageArray = [];
- foreach ($pageTexts as $pageText) {
- $fakePage = $this->createMock(Page::class);
- $fakePage->expects($this->once())
- ->method('getText')
- ->with()
- ->willReturn($pageText);
- $pageArray[] = $fakePage;
+ /**
+ * Failure path: when execute() reports failure, ocrFile() must not read the
+ * output, must query error text and exit code once each, and must throw an
+ * OcrNotPossibleException.
+ */
+ public function testThrowsOcrNotPossibleException() {
+ $pdfBefore = 'someFileContent';
+ // NOTE(review): $pdfAfter is not used anywhere in this test.
+ $pdfAfter = 'someOcrFileContent';
+
+ $this->command->expects($this->once())
+ ->method('setCommand')
+ ->willReturn($this->command);
+ $this->command->expects($this->once())
+ ->method('setStdIn')
+ ->with($pdfBefore)
+ ->willReturn($this->command);
+ // Simulate a failed run.
+ $this->command->expects($this->once())
+ ->method('execute')
+ ->willReturn(false);
+ // stdout must be ignored on failure; only the diagnostics are read.
+ $this->command->expects($this->never())
+ ->method('getOutput');
+ $this->command->expects($this->once())
+ ->method('getError');
+ $this->command->expects($this->once())
+ ->method('getExitCode');
+
+ $processor = new PdfOcrProcessor($this->command);
+ $thrown = false;
+
+ // Catch manually so both the fact and the type of the exception are asserted.
+ try {
+ $result = $processor->ocrFile($pdfBefore);
 }
-
- $fakePdfDocument = $this->createMock(Document::class);
- $fakePdfDocument->expects($this->once())
- ->method('getPages')
- ->with()
- ->willReturn($pageArray);
-
- return $fakePdfDocument;
+ catch(\Throwable $t){
+ $thrown = true;
+ $this->assertInstanceOf(OcrNotPossibleException::class, $t);
+ }
+
+ // Fails the test if ocrFile() returned normally.
+ $this->assertTrue($thrown);
 }
}
From c787895098604e689cb68879630d086a1398360d Mon Sep 17 00:00:00 2001
From: Robin Windey
Date: Thu, 15 Oct 2020 18:49:21 +0200
Subject: [PATCH 2/8] Fix variable assignment
---
lib/Wrapper/CommandWrapper.php | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/Wrapper/CommandWrapper.php b/lib/Wrapper/CommandWrapper.php
index 7eafba7..a88fe37 100644
--- a/lib/Wrapper/CommandWrapper.php
+++ b/lib/Wrapper/CommandWrapper.php
@@ -30,7 +30,7 @@ class CommandWrapper implements ICommand {
private $command;
public function __construct() {
- $command = new Command();
+ $this->command = new Command();
}
/**
From efd7a3de0b667ab274bd23cee6074c75a0219b66 Mon Sep 17 00:00:00 2001
From: Robin Windey
Date: Sat, 14 Nov 2020 18:06:09 +0100
Subject: [PATCH 3/8] Use ProcessingFileAccessor to prevent infinite loop
---
appinfo/info.xml | 2 +-
lib/AppInfo/Application.php | 6 ++
lib/BackgroundJobs/ProcessFileJob.php | 41 ++++++---
lib/Helper/IProcessingFileAccessor.php | 39 ++++++++
lib/Helper/ProcessingFileAccessor.php | 62 +++++++++++++
lib/OcrProcessors/PdfOcrProcessor.php | 11 ++-
lib/Operation.php | 14 ++-
lib/Wrapper/CommandWrapper.php | 33 ++++---
lib/Wrapper/ICommand.php | 62 +++++++------
lib/Wrapper/ViewWrapper.php | 2 +-
tests/Integration/ViewWrapperTest.php | 89 +++++++++++++++++++
.../BackgroundJobs/ProcessFileJobTest.php | 67 +++++++++++---
.../Helper/ProcessingFIleAccessorTest.php | 43 +++++++++
.../OcrProcessors/PdfOcrProcessorTest.php | 3 +-
tests/Unit/OperationTest.php | 67 +++++++++++---
tests/Unit/Wrapper/CommandWrapperTest.php | 46 ++++++++++
16 files changed, 505 insertions(+), 82 deletions(-)
create mode 100644 lib/Helper/IProcessingFileAccessor.php
create mode 100644 lib/Helper/ProcessingFileAccessor.php
create mode 100644 tests/Integration/ViewWrapperTest.php
create mode 100644 tests/Unit/Helper/ProcessingFIleAccessorTest.php
create mode 100644 tests/Unit/Wrapper/CommandWrapperTest.php
diff --git a/appinfo/info.xml b/appinfo/info.xml
index a91b54d..131f5d1 100644
--- a/appinfo/info.xml
+++ b/appinfo/info.xml
@@ -6,7 +6,7 @@
OCR processing via workflowThis app makes it possible to process various files via OCR algorithms.
The processing is done via workflow-engine and can therefore easily be customized.
- 1.20.0
+ 1.20.1agplRobin WindeyWorkflowOcr
diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php
index 6b1cb2d..8f28564 100644
--- a/lib/AppInfo/Application.php
+++ b/lib/AppInfo/Application.php
@@ -27,6 +27,8 @@
namespace OCA\WorkflowOcr\AppInfo;
+use OCA\WorkflowOcr\Helper\IProcessingFileAccessor;
+use OCA\WorkflowOcr\Helper\ProcessingFileAccessor;
use OCA\WorkflowOcr\Listener\RegisterFlowOperationsListener;
use OCA\WorkflowOcr\OcrProcessors\IOcrProcessorFactory;
use OCA\WorkflowOcr\OcrProcessors\OcrProcessorFactory;
@@ -65,6 +67,10 @@ public function register(IRegistrationContext $context): void {
$context->registerServiceAlias(IFilesystem::class, Filesystem::class);
$context->registerServiceAlias(ICommand::class, CommandWrapper::class);
+ $context->registerService(IProcessingFileAccessor::class, function () {
+ return ProcessingFileAccessor::getInstance();
+ });
+
$context->registerEventListener(RegisterOperationsEvent::class, RegisterFlowOperationsListener::class);
}
diff --git a/lib/BackgroundJobs/ProcessFileJob.php b/lib/BackgroundJobs/ProcessFileJob.php
index 41694a9..648ca5a 100644
--- a/lib/BackgroundJobs/ProcessFileJob.php
+++ b/lib/BackgroundJobs/ProcessFileJob.php
@@ -32,6 +32,7 @@
use \OCP\Files\File;
use OCA\WorkflowOcr\Exception\OcrNotPossibleException;
use OCA\WorkflowOcr\Exception\OcrProcessorNotFoundException;
+use OCA\WorkflowOcr\Helper\IProcessingFileAccessor;
use OCA\WorkflowOcr\Service\IOcrService;
use OCA\WorkflowOcr\Wrapper\IFilesystem;
use OCA\WorkflowOcr\Wrapper\IViewFactory;
@@ -62,6 +63,8 @@ class ProcessFileJob extends \OC\BackgroundJob\QueuedJob {
private $userManager;
/** @var IUserSession */
private $userSession;
+ /** @var IProcessingFileAccessor */
+ private $processingFileAccessor;
public function __construct(
LoggerInterface $logger,
@@ -70,7 +73,8 @@ public function __construct(
IViewFactory $viewFactory,
IFilesystem $filesystem,
IUserManager $userManager,
- IUserSession $userSession) {
+ IUserSession $userSession,
+ IProcessingFileAccessor $processingFileAccessor) {
$this->logger = $logger;
$this->rootFolder = $rootFolder;
$this->ocrService = $ocrService;
@@ -78,6 +82,7 @@ public function __construct(
$this->filesystem = $filesystem;
$this->userManager = $userManager;
$this->userSession = $userSession;
+ $this->processingFileAccessor = $processingFileAccessor;
}
/**
@@ -153,6 +158,7 @@ private function processFile(string $filePath) : void {
$this->logger->info('Skipping process for \'' . $filePath . '\'. It is not a file');
return;
}
+
try {
$ocrFile = $this->ocrFile($node);
} catch (OcrNotPossibleException $ocrNpEx) {
@@ -163,12 +169,7 @@ private function processFile(string $filePath) : void {
return;
}
- $dirPath = dirname($filePath);
- $filename = basename($filePath);
-
- // Create new file or file-version with OCR-file
- $view = $this->viewFactory->create($dirPath);
- $view->file_put_contents($filename, $ocrFile);
+ $this->createNewFileVersion($filePath, $ocrFile, $node->getId());
}
/**
@@ -192,10 +193,30 @@ private function ocrFile(File $file) : string {
return $this->ocrService->ocrFile($file->getMimeType(), $file->getContent());
}
- /**
- * @param string $uid
- */
private function shutdownUserEnvironment() : void {
$this->userSession->setUser(null);
}
+
+ /**
+ * @param string $filePath The filepath of the file to write
+ * @param string $ocrContent The new filecontent (which was OCR processed)
+ * @param int $fileId The id of the file to write. Used to flag the file as currently being processed.
+ */
+ private function createNewFileVersion(string $filePath, string $ocrContent, int $fileId) : void {
+ $dirPath = dirname($filePath);
+ $filename = basename($filePath);
+
+ $this->processingFileAccessor->setCurrentlyProcessedFileId($fileId);
+
+ try {
+ $view = $this->viewFactory->create($dirPath);
+ // Create new file or file-version with OCR-file
+ // This will trigger 'postWrite' event which would normally
+ // add the file to the queue again but this is tackled
+ // by the processingFileAccessor.
+ $view->file_put_contents($filename, $ocrContent);
+ } finally {
+ $this->processingFileAccessor->setCurrentlyProcessedFileId(null);
+ }
+ }
}
diff --git a/lib/Helper/IProcessingFileAccessor.php b/lib/Helper/IProcessingFileAccessor.php
new file mode 100644
index 0000000..251a3a8
--- /dev/null
+++ b/lib/Helper/IProcessingFileAccessor.php
@@ -0,0 +1,39 @@
+
+ *
+ * @license GNU AGPL version 3 or any later version
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+
+namespace OCA\WorkflowOcr\Helper;
+
+interface IProcessingFileAccessor {
+ /**
+ * Returns the id of the file which is currently
+ * processed via OCR
+ * @return ?int
+ */
+ public function getCurrentlyProcessedFileId() : ?int;
+
+ /**
+ * Sets the id of the file which is currently
+ * processed via OCR
+ */
+ public function setCurrentlyProcessedFileId(?int $fileId) : void;
+}
diff --git a/lib/Helper/ProcessingFileAccessor.php b/lib/Helper/ProcessingFileAccessor.php
new file mode 100644
index 0000000..573a9a6
--- /dev/null
+++ b/lib/Helper/ProcessingFileAccessor.php
@@ -0,0 +1,62 @@
+
+ *
+ * @license GNU AGPL version 3 or any later version
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+
+namespace OCA\WorkflowOcr\Helper;
+
+/**
+ * This class is a singleton which holds the id
+ * of the currently OCR processed file. This ensures
+ * that a file is not added to the processing queue
+ * if the 'postWrite' hook was triggered by a new
+ * version created by the OCR process.
+ */
+class ProcessingFileAccessor implements IProcessingFileAccessor {
+ /** @var ?int */
+ private $currentlyProcessedFileId;
+
+ /** @var ?ProcessingFileAccessor */
+ private static $instance;
+ public static function getInstance() : ProcessingFileAccessor {
+ if (self::$instance == null) {
+ self::$instance = new ProcessingFileAccessor();
+ }
+ return self::$instance;
+ }
+
+ private function __construct() {
+ }
+
+ /**
+ * @inheritdoc
+ */
+ public function getCurrentlyProcessedFileId() : ?int {
+ return $this->currentlyProcessedFileId;
+ }
+
+ /**
+ * @inheritdoc
+ */
+ public function setCurrentlyProcessedFileId(?int $fileId) : void {
+ $this->currentlyProcessedFileId = $fileId;
+ }
+}
diff --git a/lib/OcrProcessors/PdfOcrProcessor.php b/lib/OcrProcessors/PdfOcrProcessor.php
index 48a02e5..57986a2 100644
--- a/lib/OcrProcessors/PdfOcrProcessor.php
+++ b/lib/OcrProcessors/PdfOcrProcessor.php
@@ -39,12 +39,15 @@ public function ocrFile(string $fileContent): string {
->setCommand("ocrmypdf --skip-text -q - - | cat")
->setStdIn($fileContent);
- if ($this->command->execute()) {
+ $success = $this->command->execute();
+ $errorOutput = $this->command->getError();
+ $stdErr = $this->command->getStdErr();
+ $exitCode = $this->command->getExitCode();
+
+ if ($success && $errorOutput === '' && $stdErr === '') {
return $this->command->getOutput();
} else {
- $error = $this->command->getError();
- $exitCode = $this->command->getExitCode();
- throw new OcrNotPossibleException('OCRmyPDF exited abnormally with exit-code ' . $exitCode . '. Message: ' . $error);
+ throw new OcrNotPossibleException('OCRmyPDF exited abnormally with exit-code ' . $exitCode . '. Message: ' . $errorOutput . ' ' . $stdErr);
}
}
}
diff --git a/lib/Operation.php b/lib/Operation.php
index 6b119d6..039c00d 100644
--- a/lib/Operation.php
+++ b/lib/Operation.php
@@ -36,6 +36,8 @@
use OCP\WorkflowEngine\IRuleMatcher;
use OCP\WorkflowEngine\ISpecificOperation;
use OCA\WorkflowOcr\BackgroundJobs\ProcessFileJob;
+use OCA\WorkflowOcr\Helper\IProcessingFileAccessor;
+use OCA\WorkflowOcr\Helper\SynchronizationHelper;
use OCP\Files\FileInfo;
use OCP\Files\Node;
use OCP\IURLGenerator;
@@ -51,12 +53,15 @@ class Operation implements ISpecificOperation {
private $logger;
/** @var IURLGenerator */
private $urlGenerator;
+ /** @var IProcessingFileAccessor */
+ private $processingFileAccessor;
- public function __construct(IJobList $jobList, IL10N $l, LoggerInterface $logger, IURLGenerator $urlGenerator) {
+ public function __construct(IJobList $jobList, IL10N $l, LoggerInterface $logger, IURLGenerator $urlGenerator, IProcessingFileAccessor $processingFileAccessor) {
$this->jobList = $jobList;
$this->l = $l;
$this->logger = $logger;
$this->urlGenerator = $urlGenerator;
+ $this->processingFileAccessor = $processingFileAccessor;
}
/**
@@ -134,6 +139,13 @@ private function checkNode(Node $node) : bool {
return false;
}
+ // Check if the event was triggered by OCR rewrite of the file
+ if ($node->getId() === $this->processingFileAccessor->getCurrentlyProcessedFileId()) {
+ $this->logger->debug('Not processing event because file with path \'{path}\' was written by OCR process.',
+ ['path' => $filePath]);
+ return false;
+ }
+
return true;
}
}
diff --git a/lib/Wrapper/CommandWrapper.php b/lib/Wrapper/CommandWrapper.php
index a88fe37..ebe1264 100644
--- a/lib/Wrapper/CommandWrapper.php
+++ b/lib/Wrapper/CommandWrapper.php
@@ -34,46 +34,53 @@ public function __construct() {
}
/**
- * @inheritdoc
- */
+ * @inheritdoc
+ */
public function setCommand(string $command) : ICommand {
$this->command->setCommand($command);
return $this;
}
/**
- * @inheritdoc
- */
+ * @inheritdoc
+ */
public function setStdIn(string $stdIn) : ICommand {
$this->command->setStdIn($stdIn);
return $this;
}
/**
- * @inheritdoc
- */
+ * @inheritdoc
+ */
public function execute() : bool {
return (bool)$this->command->execute();
}
/**
- * @inheritdoc
- */
+ * @inheritdoc
+ */
public function getOutput(bool $trim = true) : string {
return (string)$this->command->getOutput($trim);
}
/**
- * @inheritdoc
- */
+ * @inheritdoc
+ */
public function getError(bool $trim = true) : string {
return (string)$this->command->getError($trim);
}
+
+ /**
+ * @inheritdoc
+ */
+ public function getStdErr(bool $trim = true) : string {
+ return (string)$this->command->getStdErr($trim);
+ }
/**
- * @inheritdoc
- */
- public function getExitCode() {
+ * @inheritdoc
+ */
+ public function getExitCode() {
return $this->command->getExitCode();
}
}
diff --git a/lib/Wrapper/ICommand.php b/lib/Wrapper/ICommand.php
index 927d1ca..5e2fe0a 100644
--- a/lib/Wrapper/ICommand.php
+++ b/lib/Wrapper/ICommand.php
@@ -28,47 +28,53 @@
*/
interface ICommand {
/**
- * @param string $command the command or full command string to execute,
- * like 'gzip' or 'gzip -d'. You can still call addArg() to add more
- * arguments to the command. If $escapeCommand was set to true, the command
- * gets escaped with escapeshellcmd().
- * @return static for method chaining
- */
+ * @param string $command the command or full command string to execute,
+ * like 'gzip' or 'gzip -d'. You can still call addArg() to add more
+ * arguments to the command. If $escapeCommand was set to true, the command
+ * gets escaped with escapeshellcmd().
+ * @return static for method chaining
+ */
public function setCommand(string $command) : ICommand;
/**
- * @param string|resource $stdIn If set, the string will be piped to the
- * command via standard input. This enables the same functionality as
- * piping on the command line. It can also be a resource like a file
- * handle or a stream in which case its content will be piped into the
- * command like an input redirection.
- * @return static for method chaining
- */
+ * @param string|resource $stdIn If set, the string will be piped to the
+ * command via standard input. This enables the same functionality as
+ * piping on the command line. It can also be a resource like a file
+ * handle or a stream in which case its content will be piped into the
+ * command like an input redirection.
+ * @return static for method chaining
+ */
public function setStdIn(string $stdIn) : ICommand;
/**
- * Execute the command
- *
- * @return bool whether execution was successful. If `false`, error details
- * can be obtained from getError(), getStdErr() and getExitCode().
- */
+ * Execute the command
+ *
+ * @return bool whether execution was successful. If `false`, error details
+ * can be obtained from getError(), getStdErr() and getExitCode().
+ */
public function execute() : bool;
/**
- * @param bool $trim whether to `trim()` the return value. The default is `true`.
- * @return string the command output (stdout). Empty if none.
- */
+ * @param bool $trim whether to `trim()` the return value. The default is `true`.
+ * @return string the command output (stdout). Empty if none.
+ */
public function getOutput(bool $trim = true) : string;
/**
- * @param bool $trim whether to `trim()` the return value. The default is `true`.
- * @return string the error message, either stderr or an internal message.
- * Empty string if none.
- */
+ * @param bool $trim whether to `trim()` the return value. The default is `true`.
+ * @return string the error message, either stderr or an internal message.
+ * Empty string if none.
+ */
public function getError(bool $trim = true) : string;
+
+ /**
+ * @param bool $trim whether to `trim()` the return value. The default is `true`.
+ * @return string the stderr output. Empty if none.
+ */
+ public function getStdErr(bool $trim = true) : string;
/**
- * @return int|null the exit code or null if command was not executed yet
- */
- public function getExitCode();
+ * @return int|null the exit code or null if command was not executed yet
+ */
+ public function getExitCode();
}
diff --git a/lib/Wrapper/ViewWrapper.php b/lib/Wrapper/ViewWrapper.php
index 6f4a349..04be318 100644
--- a/lib/Wrapper/ViewWrapper.php
+++ b/lib/Wrapper/ViewWrapper.php
@@ -44,6 +44,6 @@ public function file_put_contents(string $filePath, string $content) : bool {
if (is_bool($retVal)) {
return $retVal;
}
- return boolval($retVal); // TODO :: method above returns numeric value (e.g. 10023)
+ return boolval($retVal);
}
}
diff --git a/tests/Integration/ViewWrapperTest.php b/tests/Integration/ViewWrapperTest.php
new file mode 100644
index 0000000..e36166c
--- /dev/null
+++ b/tests/Integration/ViewWrapperTest.php
@@ -0,0 +1,89 @@
+
+ *
+ * @license GNU AGPL version 3 or any later version
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+
+namespace OCA\WorkflowOcr\Tests\Integration;
+
+use Exception;
+use OC\Files\View;
+use OCA\WorkflowOcr\Tests\TestUtils;
+use OCA\WorkflowOcr\Wrapper\ViewWrapper;
+use Test\TestCase;
+
+/**
+ * @group DB
+ */
+class ViewWrapperTest extends TestCase {
+
+ /** @var TestUtils */
+ private $testUtils;
+
+ protected function setUp() : void {
+ parent::setUp();
+ $this->testUtils = new TestUtils();
+ }
+
+ /**
+ * @dataProvider dataProvider_FilePutContents
+ */
+ public function testFilePutContents(string $filename, bool $expectedResult) {
+ $user = 'mytestuser';
+ $pw = 'myuserspw';
+ $path = '/mytestuser/files';
+ $content = 'hello world';
+
+ /** @var \OCP\IUser */
+ $userObject = null;
+
+ try {
+ $userObject = $this->testUtils->createUser($user, $pw);
+ $this->loginAsUser($user);
+
+ $viewWrapper = new ViewWrapper($path);
+
+ $result = $viewWrapper->file_put_contents($filename, $content);
+ $this->assertEquals($expectedResult, $result);
+
+ // If we expect that we can write to the file we should
+ // be able to read the file afterwards
+ if ($expectedResult) {
+ $ncView = new View($path);
+ $readContent = $ncView->file_get_contents($filename);
+ $this->assertEquals($content, $readContent);
+ }
+ } finally {
+ if ($userObject) {
+ $this->logout();
+ if (!$userObject->delete()) {
+ throw new Exception("Could not delete user " . $user);
+ }
+ }
+ }
+ }
+
+ public function dataProvider_FilePutContents() {
+ return [
+ ['testfile.txt', true],
+ ['this_is_invalid/..', false]
+ ];
+ }
+}
diff --git a/tests/Unit/BackgroundJobs/ProcessFileJobTest.php b/tests/Unit/BackgroundJobs/ProcessFileJobTest.php
index 7c49047..cd60e13 100644
--- a/tests/Unit/BackgroundJobs/ProcessFileJobTest.php
+++ b/tests/Unit/BackgroundJobs/ProcessFileJobTest.php
@@ -29,6 +29,7 @@
use OCA\WorkflowOcr\BackgroundJobs\ProcessFileJob;
use OCA\WorkflowOcr\Exception\OcrNotPossibleException;
use OCA\WorkflowOcr\Exception\OcrProcessorNotFoundException;
+use OCA\WorkflowOcr\Helper\IProcessingFileAccessor;
use OCA\WorkflowOcr\Service\IOcrService;
use OCA\WorkflowOcr\Wrapper\IFilesystem;
use OCA\WorkflowOcr\Wrapper\IView;
@@ -68,6 +69,8 @@ class ProcessFileJobTest extends TestCase {
private $userManager;
/** @var IUser|MockObject */
private $user;
+ /** @var IProcessingFileAccessor|MockObject */
+ private $processingFileAccessor;
/** @var JobList */
private $jobList;
/** @var ProcessFileJob */
@@ -76,18 +79,13 @@ class ProcessFileJobTest extends TestCase {
public function setUp() : void {
parent::setUp();
- /** @var LoggerInterface */
$this->logger = $this->createMock(LoggerInterface::class);
- /** @var IRootFolder */
$this->rootFolder = $this->createMock(IRootFolder::class);
- /** @var IOcrService */
$this->ocrService = $this->createMock(IOcrService::class);
- /** @var IViewFactory */
$this->viewFactory = $this->createMock(IViewFactory::class);
- /** @var IFilesystem */
$this->filesystem = $this->createMock(IFilesystem::class);
- /** @var IUserSession */
$this->userSession = $this->createMock(IUserSession::class);
+ $this->processingFileAccessor = $this->createMock(IProcessingFileAccessor::class);
$userManager = $this->createMock(IUserManager::class);
$user = $this->createMock(IUser::class);
@@ -95,9 +93,7 @@ public function setUp() : void {
->withAnyParameters()
->willReturn($user);
- /** @var IUserManager */
$this->userManager = $userManager;
- /** @var IUser */
$this->user = $user;
$this->processFileJob = new ProcessFileJob(
@@ -107,7 +103,8 @@ public function setUp() : void {
$this->viewFactory,
$this->filesystem,
$this->userManager,
- $this->userSession
+ $this->userSession,
+ $this->processingFileAccessor
);
/** @var IConfig */
@@ -337,7 +334,8 @@ public function testThrowsNoUserException_OnNonExistingUser() {
$this->viewFactory,
$this->filesystem,
$userManager,
- $this->userSession
+ $this->userSession,
+ $this->processingFileAccessor
);
$arguments = ['filePath' => '/admin/files/someInvalidStuff', 'uid' => 'nonexistinguser'];
$processFileJob->setArgument($arguments);
@@ -345,6 +343,53 @@ public function testThrowsNoUserException_OnNonExistingUser() {
$processFileJob->execute($this->jobList);
}
+ /**
+ * @dataProvider dataProvider_ValidArguments
+ */
+ public function testCallsProcessingFileAccessor(array $arguments, string $user, string $rootFolderPath) {
+ $this->processFileJob->setArgument($arguments);
+ $mimeType = 'application/pdf';
+ $content = 'someFileContent';
+ $ocrContent = 'someOcrProcessedFile';
+ $filePath = $arguments['filePath'];
+ $dirPath = dirname($filePath);
+ $filename = basename($filePath);
+
+ $fileMock = $this->createValidFileMock($mimeType, $content);
+ $this->rootFolder->method('get')
+ ->with($arguments['filePath'])
+ ->willReturn($fileMock);
+
+ $this->ocrService->expects($this->once())
+ ->method('ocrFile')
+ ->willReturn($ocrContent);
+
+ $viewMock = $this->createMock(IView::class);
+ $this->viewFactory->expects($this->once())
+ ->method('create')
+ ->willReturn($viewMock);
+
+ $calledWith42 = 0;
+ $calledWithNull = 0;
+
+ $this->processingFileAccessor->expects($this->exactly(2))
+ ->method('setCurrentlyProcessedFileId')
+ ->with($this->callback(function ($id) use (&$calledWith42, &$calledWithNull) {
+ if ($id === 42) {
+ $calledWith42++;
+ } elseif ($id === null) {
+ $calledWithNull++;
+ }
+
+ return true;
+ }));
+
+ $this->processFileJob->execute($this->jobList);
+
+ $this->assertEquals(1, $calledWith42);
+ $this->assertEquals(1, $calledWithNull);
+ }
+
public function dataProvider_InvalidArguments() {
$arr = [
[null, 1],
@@ -396,6 +441,8 @@ private function createValidFileMock(string $mimeType = 'application/pdf', strin
->willReturn($mimeType);
$fileMock->method('getContent')
->willReturn($content);
+ $fileMock->method('getId')
+ ->willReturn(42);
return $fileMock;
}
}
diff --git a/tests/Unit/Helper/ProcessingFIleAccessorTest.php b/tests/Unit/Helper/ProcessingFIleAccessorTest.php
new file mode 100644
index 0000000..4d220e1
--- /dev/null
+++ b/tests/Unit/Helper/ProcessingFIleAccessorTest.php
@@ -0,0 +1,43 @@
+
+ *
+ * @license GNU AGPL version 3 or any later version
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+
+namespace OCA\WorkflowOcr\Tests\Unit\Helper;
+
+use OCA\WorkflowOcr\Helper\ProcessingFileAccessor;
+use PHPUnit\Framework\TestCase;
+
+class ProcessingFileAccessorTest extends TestCase {
+ public function testSingleton() {
+ $o1 = ProcessingFileAccessor::getInstance();
+ $o2 = ProcessingFileAccessor::getInstance();
+
+ $this->assertTrue($o1 === $o2);
+ }
+
+ public function testGetSet() {
+ $o = ProcessingFileAccessor::getInstance();
+ $o ->setCurrentlyProcessedFileId(42);
+ $this->assertEquals(42, $o->getCurrentlyProcessedFileId());
+ $o->setCurrentlyProcessedFileId(null);
+ }
+}
diff --git a/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php b/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php
index 5ee9a51..0fb6c94 100644
--- a/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php
+++ b/tests/Unit/OcrProcessors/PdfOcrProcessorTest.php
@@ -89,8 +89,7 @@ public function testThrowsOcrNotPossibleException() {
try {
$result = $processor->ocrFile($pdfBefore);
- }
- catch(\Throwable $t){
+ } catch (\Throwable $t) {
$thrown = true;
$this->assertInstanceOf(OcrNotPossibleException::class, $t);
}
diff --git a/tests/Unit/OperationTest.php b/tests/Unit/OperationTest.php
index 6a3aa9e..0a3d588 100644
--- a/tests/Unit/OperationTest.php
+++ b/tests/Unit/OperationTest.php
@@ -25,6 +25,7 @@
use OCA\WorkflowEngine\Entity\File;
use OCA\WorkflowOcr\BackgroundJobs\ProcessFileJob;
+use OCA\WorkflowOcr\Helper\IProcessingFileAccessor;
use OCA\WorkflowOcr\Operation;
use OCP\BackgroundJob\IJobList;
use OCP\EventDispatcher\Event;
@@ -50,13 +51,16 @@ class OperationTest extends TestCase {
private $logger;
/** @var IURLGenerator|MockObject */
private $urlGenerator;
-
+ /** @var IProcessingFileAccessor|MockObject */
+ private $processingFileAccessor;
+
protected function setUp(): void {
parent::setUp();
$this->jobList = $this->createMock(IJobList::class);
$this->l = $this->createMock(IL10N::class);
$this->logger = $this->createMock(LoggerInterface::class);
$this->urlGenerator = $this->createMock(IURLGenerator::class);
+ $this->processingFileAccessor = $this->createMock(IProcessingFileAccessor::class);
}
/**
@@ -70,7 +74,7 @@ public function testDoesNothingOnInvalidEvent(string $eventName, Event $event) {
->method('debug')
->withAnyParameters();
- $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator);
+ $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor);
/** @var IRuleMatcher */
$ruleMatcher = $this->createMock(IRuleMatcher::class);
$operation->onEvent($eventName, $event, $ruleMatcher);
@@ -84,7 +88,7 @@ public function testDoesNothingOnFolderEvent() {
->method('debug')
->withAnyParameters();
- $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator);
+ $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor);
$fileMock = $this->createMock(Node::class);
$fileMock->method('getType')
@@ -97,6 +101,43 @@ public function testDoesNothingOnFolderEvent() {
$operation->onEvent($eventName, $event, $ruleMatcher);
}
+ public function testDoesNothingOnPostWriteTriggeredByCurrentOcrProcess() {
+ $this->jobList->expects($this->never())
+ ->method('add')
+ ->withAnyParameters();
+ $this->logger->expects($this->once())
+ ->method('debug')
+ ->withAnyParameters();
+
+ /** @var IProcessingFileAccessor|MockObject */
+ $processingFileAccessorMock = $this->createMock(IProcessingFileAccessor::class);
+ $processingFileAccessorMock->expects($this->once())
+ ->method('getCurrentlyProcessedFileId')
+ ->willReturn(42);
+
+ $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $processingFileAccessorMock);
+
+ $userMock = $this->createMock(IUser::class);
+ $userMock->expects($this->never())
+ ->method('getUID');
+ $fileMock = $this->createMock(Node::class);
+ $fileMock->method('getType')
+ ->willReturn(FileInfo::TYPE_FILE);
+ $fileMock->method('getPath')
+ ->willReturn('/someuser/files/somefile.pdf');
+ $fileMock->method('getOwner')
+ ->willReturn($userMock);
+ $fileMock->method('getId')
+ ->willReturn(42);
+ $event = new GenericEvent($fileMock);
+ /** @var IRuleMatcher */
+ $ruleMatcher = $this->createMock(IRuleMatcher::class);
+ $eventName = '\OCP\Files::postCreate';
+
+ $operation->onEvent($eventName, $event, $ruleMatcher);
+ }
+
+
/**
* @dataProvider dataProvider_InvalidFilePaths
*/
@@ -108,7 +149,7 @@ public function testDoesNothingOnInvalidFilePath(string $filePath) {
->method('debug')
->withAnyParameters();
- $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator);
+ $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor);
$fileMock = $this->createMock(Node::class);
$fileMock->method('getType')
@@ -131,7 +172,7 @@ public function testDoesNothingOnFileWithoutOwner() {
->method('debug')
->withAnyParameters();
- $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator);
+ $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor);
$fileMock = $this->createMock(Node::class);
$fileMock->method('getType')
@@ -156,7 +197,7 @@ public function testAddWithCorrectFilePathAndUser() {
->method('add')
->with(ProcessFileJob::class, ['filePath' => $filePath, 'uid' => $uid]);
- $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator);
+ $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor);
$userMock = $this->createMock(IUser::class);
$userMock->expects($this->once())
@@ -169,6 +210,8 @@ public function testAddWithCorrectFilePathAndUser() {
->willReturn($filePath);
$fileMock->method('getOwner')
->willReturn($userMock);
+ $fileMock->method('getId')
+ ->willReturn(42);
$event = new GenericEvent($fileMock);
/** @var IRuleMatcher */
$ruleMatcher = $this->createMock(IRuleMatcher::class);
@@ -181,7 +224,7 @@ public function testAddWithCorrectFilePathAndUser() {
* @dataProvider dataProvider_ValidScopes
*/
public function testIsAvailableForScope(int $scope) {
- $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator);
+ $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor);
$result = $operation->isAvailableForScope($scope);
$this->assertTrue($result);
@@ -197,7 +240,7 @@ public function testDoesNothing_OnValidateOperation() {
$this->urlGenerator->expects($this->never())
->method($this->anything());
- $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator);
+ $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor);
$operation->validateOperation('aName', [], 'aOp');
}
@@ -206,7 +249,7 @@ public function testCallsLang_OnGetDisplayName() {
$this->l->expects($this->once())
->method('t');
- $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator);
+ $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor);
$operation->getDisplayName();
}
@@ -216,7 +259,7 @@ public function testCallsLang_OnGetDescription() {
$this->l->expects($this->once())
->method('t');
- $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator);
+ $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor);
$operation->getDescription();
}
@@ -225,13 +268,13 @@ public function testCallsUrlGenerator_OnGetIcon() {
$this->urlGenerator->expects($this->once())
->method('imagePath');
- $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator);
+ $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor);
$operation->getIcon();
}
public function testEntityIdIsFile() {
- $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator);
+ $operation = new Operation($this->jobList, $this->l, $this->logger, $this->urlGenerator, $this->processingFileAccessor);
$this->assertEquals(File::class, $operation->getEntityId());
}
diff --git a/tests/Unit/Wrapper/CommandWrapperTest.php b/tests/Unit/Wrapper/CommandWrapperTest.php
new file mode 100644
index 0000000..6afd012
--- /dev/null
+++ b/tests/Unit/Wrapper/CommandWrapperTest.php
@@ -0,0 +1,46 @@
+
+ *
+ * @license GNU AGPL version 3 or any later version
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+namespace OCA\WorkflowOcr\Tests\Unit\Wrapper;
+
+use OCA\WorkflowOcr\Wrapper\CommandWrapper;
+use PHPUnit\Framework\TestCase;
+
+class CommandWrapperTest extends TestCase { // Exercises CommandWrapper with the 'cat' and 'echo' commands.
+ public function testWrappingPositiveCommand() { // stdin is expected back on stdout, with exit code 0
+ $cmd = new CommandWrapper();
+ $cmd->setCommand('cat')
+ ->setStdIn('hello');
+ $this->assertTrue($cmd->execute());
+ $this->assertEquals('hello', $cmd->getOutput()); // 'cat' copies its stdin to stdout
+ $this->assertEquals(0, $cmd->getExitCode());
+ }
+
+ public function testWrappingNegativeCommand() { // text written to stderr is expected from getStdErr()
+ $cmd = new CommandWrapper();
+ $cmd->setCommand('echo hello 1>&2'); // '1>&2' redirects echo's stdout to stderr
+ $cmd->execute();
+ $this->assertEquals('hello', $cmd->getStdErr());
+ $this->assertEquals('', $cmd->getError()); // getError() is expected to stay empty although stderr has content
+ }
+}
From e3aa4becd8cbb299d8de8f28583d12f2e5e568a3 Mon Sep 17 00:00:00 2001
From: Robin Windey
Date: Sat, 14 Nov 2020 18:23:43 +0100
Subject: [PATCH 4/8] Update README for OCRmyPDF
---
README.md | 50 +++++++++++---------------------------------------
1 file changed, 11 insertions(+), 39 deletions(-)
diff --git a/README.md b/README.md
index 3650e71..3b482c2 100644
--- a/README.md
+++ b/README.md
@@ -44,46 +44,23 @@ Since the actual processing of the files is done asynchronously via Nextcloud's
### Backend
-#### Imagick
-Make sure `Imagick` is installed (the command below is for debian based Linux systems. It might be different on your system.).
-```bash
-sudo apt-get install php-imagick
-```
+> :warning: Since `v1.20.1` you'll have to install `OCRmyPDF`.
-Make sure `Imagick` is properly configured so that it can access pdf files. On debian based systems edit the configuration file `/etc/ImageMagick-6/policy.xml` (path might be different on your system). It has to contain at least this line:
-```xml
-
-
-
-
-
-
-```
-If you use **any other background job setting than [`cron`](https://docs.nextcloud.com/server/latest/admin_manual/configuration_server/background_jobs_configuration.html#cron)** you'll have to restart your php environment for the above changes to be applied. Depending on your system this is usually done by restarting your `php-fpm`-daemon or webserver, for example:
+In the backend [`OCRmyPDF`](https://github.com/jbarlow83/OCRmyPDF) is used for processing PDF files. Make sure you have this commandline tool installed.
```bash
-# Restart php-fpm
-sudo systemctl restart php7.3-fpm.service
-
-# Restart Apache webserver
-sudo systemctl restart apache2
-```
-
-You can find additional information about `Imagick` [here](https://www.php.net/manual/en/imagick.setup.php).
+apt-get install ocrmypdf
+```
-> :warning: **Note that `Imagick` requires [Ghostscript](https://www.ghostscript.com) to properly read PDF files. You can find more details in the section [Supported Image Formats](https://imagemagick.org/script/formats.php#supported) of `Imagick`'s documentation.**
+Also, if you want to use specific language settings, please install the corresponding `tesseract` packages.
-#### Tesseract
-For the OCR part the commandlinetool `tesseract` is used. Make sure you have the library and appropriate languages installed. I recommend installing the packages from [PPA](https://github.com/tesseract-ocr/tessdoc/blob/master/Home.md) because they're newer than the official package-sources (i tested with `tesseract 4.1.1`). On Ubuntu 18.04 you might type the following for languages english and german:
```bash
-# Install PPA
-sudo add-apt-repository ppa:alex-p/tesseract-ocr
-sudo apt-get update
+# English
+apt-get install tesseract-ocr-eng
-# Install Tesseract and language-files
-sudo apt-get install tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng
+# German
+apt-get install tesseract-ocr-deu
```
-You can read more about the installation of `tesseract` [here](https://github.com/tesseract-ocr/tesseract/wiki).
## Usage
You can configure the OCR processing via Nextcloud's workflow engine. Therefore configure a new flow via `Settings -> Flow -> Add new flow` (if you don't see `OCR file` here the app isn't installed properly or you forgot to activate it).
@@ -232,11 +209,6 @@ That's all. If you now create a new workflow based on your added mimetype, your
## Used libraries & components
| Name | Version | Link |
|---|---|---|
-| tesseract_ocr | >= 2.9 | https://github.com/thiagoalessio/tesseract-ocr-for-php |
-| tesseract (commandline) | >= 4.0 | https://github.com/tesseract-ocr/tesseract |
-| pdfparser | >= 0.15.0 | https://www.pdfparser.org/ |
-| fpdi | >= 2.3 | https://www.setasign.com/products/fpdi/about/ |
-| fpdf | >= 1.8 | http://www.fpdf.org/ |
-| imagick php extension | >= 2 | https://www.php.net/manual/de/book.imagick.php |
-| Ghostscript | >= 9.0 | https://www.ghostscript.com/ |
+| OCRmyPDF (commandline) | >= 9.6.0 | https://github.com/jbarlow83/OCRmyPDF |
+| php-shellcommand | >= 1.6 | https://github.com/mikehaertl/php-shellcommand |
| PHPUnit | >= 8.0 | https://phpunit.de/ |
From 09b1667896c5d33338d45cc4939147aa3c285990 Mon Sep 17 00:00:00 2001
From: R0Wi
Date: Sat, 14 Nov 2020 17:26:11 +0000
Subject: [PATCH 5/8] docs: update TOC
---
README.md | 2 --
1 file changed, 2 deletions(-)
diff --git a/README.md b/README.md
index 3b482c2..b282499 100644
--- a/README.md
+++ b/README.md
@@ -14,8 +14,6 @@
- [App installation](#app-installation)
- [Nextcloud background jobs](#nextcloud-background-jobs)
- [Backend](#backend)
- - [Imagick](#imagick)
- - [Tesseract](#tesseract)
- [Usage](#usage)
- [How it works](#how-it-works)
- [General](#general)
From 98c06f3f7aa78b94daab5d509aa4fd44521eeec3 Mon Sep 17 00:00:00 2001
From: Robin Windey
Date: Sun, 15 Nov 2020 17:11:30 +0100
Subject: [PATCH 6/8] Update README + app compliance
---
README.md | 6 +-----
doc/diagramms/pdf.drawio | 1 -
doc/diagramms/pdf.svg | 3 ---
lib/Helper/ProcessingFileAccessor.php | 2 +-
4 files changed, 2 insertions(+), 10 deletions(-)
delete mode 100644 doc/diagramms/pdf.drawio
delete mode 100644 doc/diagramms/pdf.svg
diff --git a/README.md b/README.md
index 3b482c2..49caf3b 100644
--- a/README.md
+++ b/README.md
@@ -90,11 +90,7 @@ To **test** if your file gets processed properly you can do the following steps:
### PDF
-
-
-
-
-**Note on PDF processing:** since the processing algorithm for PDF files makes heavy use of splitting an recombining the single PDF pages, it could damage certain PDF files or manipulate the content somehow.
+For processing PDF files the external commandline tool [`OCRmyPDF`](https://github.com/jbarlow83/OCRmyPDF) is used. The tool is invoked with the [`--skip-text`](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped) parameter so that it will skip pages which already contain text. Therefore it is possible to process PDF files containing both "born digital" and scanned content.
## Development
### Dev setup
diff --git a/doc/diagramms/pdf.drawio b/doc/diagramms/pdf.drawio
deleted file mode 100644
index 0d6623e..0000000
--- a/doc/diagramms/pdf.drawio
+++ /dev/null
@@ -1 +0,0 @@
-7Vtbc9o4FP41zD6F8QUbeGwgabdt0nST3bZPGQULo1a2WFkOOL9+JVu2MVIMSTAGtkxmgo7l23fOd24SHXsULN9TMJ9dEQ/ijmV4y4497liWadsO/yckSSYZWHYm8Cny5KRScIueoBQaUhojD0aViYwQzNC8KpyQMIQTVpEBSsmiOm1KcPWuc+BDRXA7AViVfkMem0mpYxjlgQ8Q+TN5a8cy8kMByKdLQTQDHlmsiOyLjj2ihLDsW7AcQSzgy5H59mfyDX/+5b7/+DX6F/x9/unu+p+z7GKXLzmleAkKQ/b6Sy9NOH9iw59fyVncWwb4xvx41pevxpIcMehxAOWQUDYjPgkBviil55TEoQfFVQ0+Kud8JmTOhSYX/oSMJdIaQMwIF81YgOXR7J7iRmtK2vB+cl5EYjqBNfOkiTJAfchqXt4tlMjtH5IAMprw8yjEgKHH6sMBaYh+Ma+Emn+RaL9AqfIhHwGO5Z1uKJnAKOLCm/GlopcSdQHhYoYYvJ2DFIcF524V4SkJmYTf5K947mMQRVJhEaPkV8EGMbswbKNQzra6eISUweUKUiqc8uhAsijJHYQcL0paDqVotkLInvF2/N0PyYgs7+zb5adr3zKeLkbW1Zlpt2H6HFSafJfnp4MfYtB18uF4uXpwnOSjJWLfy5l89GPlSHmSGCSv0uRGVrkqq7TA2lp6v4Fm6anvKAXJyoQ5QSGLVq58IwSlubl21d56zprFZBcs7ad4sjeYVO/4Tco8RJPqHaRJWYaxB5tylTgREq2ZfQYPPHmrmAbAyA/59wlXF6RcINw14tnRO3kgQJ6XWSGM0BN4SK8nFC2x4Bd3zjvOWBNSVFOo5YQSKIqcT961klTpAojRtfknO/VlilY0aVpdt2+sfKyKYs/MtaBDptOIG+h6yHmZeusYsKLdS0KFqYPJrEhv33UENC7mOJ8/8KOuL755BEb5FP6omELgJdqZHGgGUCiYBpdcoEkuZiR4iKPNiUUlVRAmcQkChAVuHyB+hMK4NLai2KEmB+G3RKHPR245uktdIKd/c2mJ41ZJ7appidnT5CWm1VhiYrUSRWQ0MFdiQRkZ9NHgtZHn9VFEC5ihSfdruLa7KPImNRsK82/nGAknxV0VEe/NCQAlwaOTKwd6xuZ6oODYfgqCVmrhffIuHd1Aijhgwg0XTrw+rr+JnJpaXDvPOShyqsX5mIR/sLR7FZeROc1hmADq1OjZt9ZyXadtdg5+s3P37Oxtyc7+QbEzLylKdn4Z/VVw8sSZaA7aDpRFP/zwqcgHe2aUsyWjhgfFKEdhlKLivHWQthluSIQYIqKmeyCMkaCqq/U2AxM6fq4KHBFMhHJCEsKMVHNxy2DpiyWpLniKKexOMIm9+wjSRzSB0T0Pu1PkxxSIh7ifotTE0p4FpBePMGtdpIRGGOc36Fh2Vu03yNbBcNDtO9XQqfK11+86KmGdpgirNhm+UOQjzr7nnGbWJ2iqlSTIHUkKNqmLfgGzVEVfzWJ0aw7NlfbDY/acZWJT5jJFy7gmsWnY4/aP0uP2f3vcxjyuqeH5nl3uYLPL1TRqOxb/M7JkNnXBloFBwjX3P/HP2jLT0Hhou7Hc1lSwPlQPfURl5nBLF52vuxyIjx4qJL6CVK625Dw+vc7s0G2xM6vdp9POTo2dZ0074KmWHjskqqlbS9GpxGqTl6a6dvI7eXoV103TMrtWNQgPW0+eTFPR7zVcdOS+OMMjkzjQNd1PIisyjcFaNtvbsvfeXNmqtocSTeA7CfjdgVlNSVXw7Yaw1zraVvLRHYYZ7Uttu2Jv7ynK1D1kXZCpZmhWZ32rTCUyTDFZTGaAsm7EX5zdW89kh89Y7Lbw1ziWtbSucOqbbNtpCGGrJYTro2wDyFv5LrjDwV5dQztN7G1l33m7uB/NMnbTMWDbvb/PbArdTwxody/eSzb7v7Y534Ka9Ui3qmd1hWyc7a7NKo5iA63asHVBINxa+BClNWPasoUgEv9FgVj
T7F0gbhhxuqfoyHfl7t5rW2u7A3W7ch2N297Jrly9fTrH4rgPteFUQ/st/EOv1TCg1gLr+wEXM4Jh4TDSFo9C5702hhsgpbm2WqPJYXfVGObD8oe12Y8tyh8o2xf/AQ==
\ No newline at end of file
diff --git a/doc/diagramms/pdf.svg b/doc/diagramms/pdf.svg
deleted file mode 100644
index 3ebc3d6..0000000
--- a/doc/diagramms/pdf.svg
+++ /dev/null
@@ -1,3 +0,0 @@
-
-
-
\ No newline at end of file
diff --git a/lib/Helper/ProcessingFileAccessor.php b/lib/Helper/ProcessingFileAccessor.php
index 573a9a6..9bebce6 100644
--- a/lib/Helper/ProcessingFileAccessor.php
+++ b/lib/Helper/ProcessingFileAccessor.php
@@ -37,7 +37,7 @@ class ProcessingFileAccessor implements IProcessingFileAccessor {
/** @var ProcessingFileAccessor */
private static $instance;
public static function getInstance() : ProcessingFileAccessor {
- if (self::$instance == null) {
+ if (self::$instance === null) {
self::$instance = new ProcessingFileAccessor();
}
return self::$instance;
From 832d9b52f4fccd63f391a5542eeb8f1d26f27110 Mon Sep 17 00:00:00 2001
From: Robin Windey
Date: Sun, 15 Nov 2020 17:29:34 +0100
Subject: [PATCH 7/8] Code compliance
---
lib/Helper/ProcessingFileAccessor.php | 1 +
lib/Operation.php | 24 +++++++++++++++++-------
2 files changed, 18 insertions(+), 7 deletions(-)
diff --git a/lib/Helper/ProcessingFileAccessor.php b/lib/Helper/ProcessingFileAccessor.php
index 9bebce6..2ec1760 100644
--- a/lib/Helper/ProcessingFileAccessor.php
+++ b/lib/Helper/ProcessingFileAccessor.php
@@ -44,6 +44,7 @@ public static function getInstance() : ProcessingFileAccessor {
}
private function __construct() {
+ // Just ensuring singleton instance ...
}
/**
diff --git a/lib/Operation.php b/lib/Operation.php
index 039c00d..7ad7b64 100644
--- a/lib/Operation.php
+++ b/lib/Operation.php
@@ -105,7 +105,9 @@ public function onEvent(string $eventName, Event $event, IRuleMatcher $ruleMatch
return;
}
- if (!$this->checkNode($node)) {
+ if (!$this->pathIsValid($node) ||
+ !$this->ownerExists($node) ||
+ $this->eventTriggeredByOcrProcess($node)) {
return;
}
@@ -120,7 +122,7 @@ public function getEntityId(): string {
return File::class;
}
- private function checkNode(Node $node) : bool {
+ private function pathIsValid(Node $node) : bool {
// Check path has valid structure
$filePath = $node->getPath();
// '', admin, 'files', 'path/to/file.pdf'
@@ -131,21 +133,29 @@ private function checkNode(Node $node) : bool {
return false;
}
- // Check owner exists
+ return true;
+ }
+
+ private function ownerExists(Node $node) : bool {
+ // Check owner of file exists
$owner = $node->getOwner();
if ($owner === null) {
$this->logger->debug('Not processing event because file with path \'{path}\' has no owner.',
- ['path' => $filePath]);
+ ['path' => $node->getPath()]);
return false;
}
+ return true;
+ }
+
+ private function eventTriggeredByOcrProcess(Node $node) : bool {
// Check if the event was triggered by OCR rewrite of the file
if ($node->getId() === $this->processingFileAccessor->getCurrentlyProcessedFileId()) {
$this->logger->debug('Not processing event because file with path \'{path}\' was written by OCR process.',
- ['path' => $filePath]);
- return false;
+ ['path' => $node->getPath()]);
+ return true;
}
- return true;
+ return false;
}
}
From 4c50dd3e1290bac8ab140b855de5237161edaaef Mon Sep 17 00:00:00 2001
From: Robin Windey
Date: Mon, 30 Nov 2020 21:08:32 +0100
Subject: [PATCH 8/8] Apply suggestions from code review
Co-authored-by: Manuel Bentele
---
README.md | 2 +-
lib/OcrProcessors/PdfOcrProcessor.php | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 8247019..3cea73f 100644
--- a/README.md
+++ b/README.md
@@ -88,7 +88,7 @@ To **test** if your file gets processed properly you can do the following steps:
### PDF
-For processing PDF files the external commandline tool [`OCRmyPDF`](https://github.com/jbarlow83/OCRmyPDF) is used. The tool is invoked with the [`--skip-text`](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped) parameter so that it will skip pages which already contain text. Therefore it is possible to process PDF files containing both "born digital" and scanned content.
+For processing PDF files, the external command line tool [`OCRmyPDF`](https://github.com/jbarlow83/OCRmyPDF) is used. The tool is invoked with the [`--redo-ocr`](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped) parameter so that it will perform a detailed text analysis. The detailed analysis masks out visible text and sends the image of each page to the OCR processor. After processing, additional text is inserted as OCR, whereas existing text in a mixed file document (images embedded into text pages) is not disrupted.
## Development
### Dev setup
diff --git a/lib/OcrProcessors/PdfOcrProcessor.php b/lib/OcrProcessors/PdfOcrProcessor.php
index 57986a2..f377ce4 100644
--- a/lib/OcrProcessors/PdfOcrProcessor.php
+++ b/lib/OcrProcessors/PdfOcrProcessor.php
@@ -36,7 +36,7 @@ public function __construct(ICommand $command) {
public function ocrFile(string $fileContent): string {
$this->command
- ->setCommand("ocrmypdf --skip-text -q - - | cat")
+ ->setCommand("ocrmypdf --redo-ocr -q - - | cat")
->setStdIn($fileContent);
$success = $this->command->execute();