Skip to content

Commit

Permalink
Add image conversion (#111)
Browse files Browse the repository at this point in the history
* Add image conversion capability for JPEG and PNG images.

This conversion uses ImageMagick's "convert" command to create a
new PDF file. It does not OCR the file during the conversion, but
requires a separate flow for the newly created PDF file.

* Implement optional image conversion before PDF processing

* Add optional png/jpg conversion via Imagick
* Closing #107

tmp test

* Fix Psalm errors

* Use api packages from vendor
* Update Psalm baseline
* Apply Psalm autofix

* Move OCR processors to different classes

* Code- & docs cleanup

* Fix code smells & psalm errors

Co-authored-by: lbdroid <[email protected]>
  • Loading branch information
2 people authored and github-actions[bot] committed May 21, 2022
1 parent 8db58f9 commit 4d8e820
Show file tree
Hide file tree
Showing 24 changed files with 502 additions and 264 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ jobs:
auth_header="$(git config --local --get http.https://github.com/.extraheader)"
git submodule sync --recursive
git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1
- name: Checkout app
uses: actions/checkout@v2
with:
Expand All @@ -47,7 +48,7 @@ jobs:
with:
php-version: ${{ matrix.php-versions }}
tools: phpunit
extensions: mbstring, iconv, fileinfo, intl, sqlite, pdo_sqlite, gd, zip
extensions: mbstring, iconv, fileinfo, intl, sqlite, pdo_sqlite, gd, zip, imagick
coverage: xdebug

- name: Set up PHPUnit
Expand Down
31 changes: 11 additions & 20 deletions .github/workflows/phpunit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ env:
APP_NAME: workflow_ocr

jobs:
php:
sqlite:
runs-on: ubuntu-20.04

strategy:
Expand All @@ -37,17 +37,18 @@ jobs:
auth_header="$(git config --local --get http.https://github.com/.extraheader)"
git submodule sync --recursive
git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1
- name: Checkout app
uses: actions/checkout@v2
with:
path: apps/${{ env.APP_NAME }}

- name: Set up php ${{ matrix.php-versions }}
uses: shivammathur/setup-php@v2
with:
php-version: ${{ matrix.php-versions }}
tools: phpunit
extensions: mbstring, iconv, fileinfo, intl, sqlite, pdo_sqlite, gd, zip
extensions: mbstring, iconv, fileinfo, intl, sqlite, pdo_sqlite, gd, zip, imagick
coverage: none

- name: Set up PHPUnit
Expand All @@ -64,11 +65,7 @@ jobs:
php -S localhost:8080 &
- name: PHPUnit
working-directory: apps/${{ env.APP_NAME }}
run: ./vendor/phpunit/phpunit/phpunit -c phpunit.xml

- name: PHPUnit integration
working-directory: apps/${{ env.APP_NAME }}
run: ./vendor/phpunit/phpunit/phpunit -c phpunit.integration.xml
run: make test

mysql:
runs-on: ubuntu-20.04
Expand Down Expand Up @@ -105,6 +102,7 @@ jobs:
auth_header="$(git config --local --get http.https://github.com/.extraheader)"
git submodule sync --recursive
git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1
- name: Checkout app
uses: actions/checkout@v2
with:
Expand All @@ -115,7 +113,7 @@ jobs:
with:
php-version: ${{ matrix.php-versions }}
tools: phpunit
extensions: mbstring, iconv, fileinfo, intl, mysql, pdo_mysql, gd, zip
extensions: mbstring, iconv, fileinfo, intl, mysql, pdo_mysql, gd, zip, imagick
coverage: none

- name: Set up PHPUnit
Expand All @@ -132,11 +130,7 @@ jobs:
php -S localhost:8080 &
- name: PHPUnit
working-directory: apps/${{ env.APP_NAME }}
run: ./vendor/phpunit/phpunit/phpunit -c phpunit.xml

- name: PHPUnit integration
working-directory: apps/${{ env.APP_NAME }}
run: ./vendor/phpunit/phpunit/phpunit -c phpunit.integration.xml
run: make test

pgsql:
runs-on: ubuntu-20.04
Expand Down Expand Up @@ -175,6 +169,7 @@ jobs:
auth_header="$(git config --local --get http.https://github.com/.extraheader)"
git submodule sync --recursive
git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1
- name: Checkout app
uses: actions/checkout@v2
with:
Expand All @@ -185,7 +180,7 @@ jobs:
with:
php-version: ${{ matrix.php-versions }}
tools: phpunit
extensions: mbstring, iconv, fileinfo, intl, pgsql, pdo_pgsql, gd, zip
extensions: mbstring, iconv, fileinfo, intl, pgsql, pdo_pgsql, gd, zip, imagick
coverage: none

- name: Set up PHPUnit
Expand All @@ -202,8 +197,4 @@ jobs:
php -S localhost:8080 &
- name: PHPUnit
working-directory: apps/${{ env.APP_NAME }}
run: ./vendor/phpunit/phpunit/phpunit -c phpunit.xml

- name: PHPUnit integration
working-directory: apps/${{ env.APP_NAME }}
run: ./vendor/phpunit/phpunit/phpunit -c phpunit.integration.xml
run: make test
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
- [How it works](#how-it-works)
- [General](#general)
- [PDF](#pdf)
- [Images](#images)
- [Development](#development)
- [Dev setup](#dev-setup)
- [Debugging](#debugging)
Expand Down Expand Up @@ -52,13 +53,15 @@ Since the actual processing of the files is done asynchronously via Nextcloud's
### Backend
> :warning: Since `v1.20.1` you'll have to install `OCRmyPDF`.
In the backend [`OCRmyPDF`](https://github.com/jbarlow83/OCRmyPDF) is used for processing PDF files. Make sure you have this commandline tool installed. Make sure you have the appropriate version (see below, Used libraries').
In the backend, [`OCRmyPDF`](https://github.com/jbarlow83/OCRmyPDF) is used for processing PDF files. Make sure you have this command-line tool installed in an appropriate version (see 'Used libraries' below).

```bash
apt-get install ocrmypdf
```

Also if you want to use specific language settings please install the corresponding `tesseract` packages.
The `ocrmypdf` CLI can also convert single image files (`jpg`/`png`) to PDF before processing them via OCR. This mode is also supported by this app. You can read more about it in the [official docs](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html#option-use-ocrmypdf-single-images-only).

Also if you want to use specific **language settings** please install the corresponding `tesseract` packages.

```bash
# English
Expand Down Expand Up @@ -159,7 +162,11 @@ To **test** if your file gets processed properly you can do the following steps:
### PDF
For processing PDF files, the external command line tool [`OCRmyPDF`](https://github.com/jbarlow83/OCRmyPDF) is used. The tool is invoked with the [`--redo-ocr`](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped) parameter so that it will perform a detailed text analysis. The detailed analysis masks out visible text and sends the image of each page to the OCR processor. After processing, additional text is inserted as OCR, whereas existing text in a mixed file document (images embedded into text pages) is not disrupted.

### Images
For processing single images (currently `jpg` and `png` are supported), `ocrmypdf` converts the image to a PDF. The converted PDF file will then be OCR processed and saved as a new file with the original filename and the extension `.pdf` (for example `myImage.jpg` will be saved to `myImage.jpg.pdf`). The original image file will remain untouched.

## Development

### Dev setup
Tools and packages you need for development:
* `make`
Expand Down Expand Up @@ -342,4 +349,4 @@ That's all. If you now create a new workflow based on your added mimetype, your
| OCRmyPDF (commandline) | >= 9.6.0 | https://github.com/jbarlow83/OCRmyPDF On Debian, you might need to manually install a more recent version as described in https://ocrmypdf.readthedocs.io/en/latest/installation.html#ubuntu-18-04-lts; see https://github.com/R0Wi/workflow_ocr/issues/46 |
| php-shellcommand | >= 1.6 | https://github.com/mikehaertl/php-shellcommand |
| chain | >= 0.9.0 | https://packagist.org/packages/cocur/chain |
| PHPUnit | >= 8.0 | https://phpunit.de/ |
| PHPUnit | >= 8.0 | https://phpunit.de/ |
5 changes: 3 additions & 2 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"phpunit/php-code-coverage": "9.2.5",
"phpunit/phpcov": "8.2.*",
"nextcloud/coding-standard": "^1.0",
"vimeo/psalm": "^4.22",
"vimeo/psalm": "^4.23",
"christophwurst/nextcloud": "dev-master"
},
"config": {
Expand All @@ -25,6 +25,7 @@
"psalm": "psalm --threads=1",
"psalm:update-baseline": "psalm --threads=1 --update-baseline",
"psalm:clear": "psalm --clear-cache && psalm --clear-global-cache",
"psalm:fix": "psalm --alter --issues=InvalidReturnType,InvalidNullableReturnType,MissingParamType,InvalidFalsableReturnType"
"psalm:fix": "psalm --alter --issues=InvalidReturnType,InvalidNullableReturnType,MissingParamType,InvalidFalsableReturnType",
"psalm:write-baseline": "psalm --threads=1 --set-baseline=./tests/psalm-baseline.xml"
}
}
23 changes: 12 additions & 11 deletions composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion lib/AppInfo/Application.php
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ public function boot(IBootContext $context): void {
$this->requireAutoload();
}

private function requireAutoload() {
private function requireAutoload(): void {
if (is_dir(self::COMPOSER_DIR) && file_exists(self::COMPOSER_DIR . 'autoload.php')) {
require_once self::COMPOSER_DIR . 'autoload.php';
} else {
Expand Down
21 changes: 11 additions & 10 deletions lib/BackgroundJobs/ProcessFileJob.php
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ private function processFile(string $filePath, WorkflowSettings $settings) : voi
}

try {
$ocrFile = $this->ocrFile($node, $settings);
$ocrFile = $this->ocrService->ocrFile($node, $settings);
} catch (OcrNotPossibleException $ocrNpEx) {
$this->logger->error('OCR for file ' . $node->getPath() . ' not possible. Message: ' . $ocrNpEx->getMessage());
return;
Expand All @@ -179,7 +179,16 @@ private function processFile(string $filePath, WorkflowSettings $settings) : voi
return;
}

$this->createNewFileVersion($filePath, $ocrFile, $node->getId());
$fileContent = $ocrFile->getFileContent();
$nodeId = $node->getId();
$originalFileExtension = $node->getExtension();
$newFileExtension = $ocrFile->getFileExtension();

if ($originalFileExtension === $newFileExtension) {
$this->createNewFileVersion($filePath, $fileContent, $nodeId);
} else {
$this->createNewFileVersion($filePath.".pdf", $fileContent, $nodeId);
}
}

private function getNode(string $filePath) : ?Node {
Expand Down Expand Up @@ -213,14 +222,6 @@ private function initUserEnvironment(string $uid) : void {
$this->filesystem->init($uid, '/' . $uid . '/files');
}

/**
* @param File $file
* @param WorkflowSettings $settings
*/
private function ocrFile(File $file, WorkflowSettings $settings) : string {
return $this->ocrService->ocrFile($file->getMimeType(), $file->getContent(), $settings);
}

private function shutdownUserEnvironment() : void {
$this->userSession->setUser(null);
}
Expand Down
7 changes: 4 additions & 3 deletions lib/OcrProcessors/IOcrProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,16 @@
use OCA\WorkflowOcr\Exception\OcrNotPossibleException;
use OCA\WorkflowOcr\Model\GlobalSettings;
use OCA\WorkflowOcr\Model\WorkflowSettings;
use OCP\Files\File;

interface IOcrProcessor {
/**
* Processes OCR on the given file
* @param string $fileContent The file to be processed
* @param File $file The file to be processed
* @param WorkflowSettings $settings The settings to be used for this specific workflow
* @param GlobalSettings $globalSettings The global settings configured for all OCR workflows on this system
* @return string The processed file as byte string
* @return OcrProcessorResult
* @throws OcrNotPossibleException
*/
public function ocrFile(string $fileContent, WorkflowSettings $settings, GlobalSettings $globalSettings) : string;
public function ocrFile(File $file, WorkflowSettings $settings, GlobalSettings $globalSettings) : OcrProcessorResult;
}
7 changes: 7 additions & 0 deletions lib/OcrProcessors/IOcrProcessorFactory.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,11 @@ interface IOcrProcessorFactory {
* Creates a IOcrProcessor object for the given mimetype
*/
public function create(string $mimeType) : IOcrProcessor;

/**
* Returns true, if an OCR processor for the given mimetype
* can be constructed.
* @return bool
*/
public function canCreate(string $mimeType) : bool;
}
33 changes: 33 additions & 0 deletions lib/OcrProcessors/ImageOcrProcessor.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
<?php

declare(strict_types=1);

/**
* @copyright Copyright (c) 2022 Robin Windey <[email protected]>
*
* @license GNU AGPL version 3 or any later version
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

namespace OCA\WorkflowOcr\OcrProcessors;

use OCA\WorkflowOcr\Model\WorkflowSettings;
use OCA\WorkflowOcr\Model\GlobalSettings;

/**
 * OCR processor variant for single image files.
 *
 * The only thing this subclass defines is the additional command line
 * argument string; everything else comes from OcrMyPdfBasedProcessor.
 *
 * NOTE(review): the base class is not visible here - presumably it appends
 * the returned string to the ocrmypdf invocation; confirm against
 * OcrMyPdfBasedProcessor.
 */
class ImageOcrProcessor extends OcrMyPdfBasedProcessor {
	/**
	 * Returns the extra command line arguments used for image input.
	 *
	 * Neither settings object is consulted; the result is a constant string.
	 *
	 * @param WorkflowSettings $settings Settings of the current workflow (unused here)
	 * @param GlobalSettings $globalSettings System-wide OCR settings (unused here)
	 * @return string Always '--image-dpi 300'
	 */
	protected function getAdditionalCommandlineArgs(WorkflowSettings $settings, GlobalSettings $globalSettings): string {
		$imageDpiArgument = '--image-dpi 300';

		return $imageDpiArgument;
	}
}
Loading

0 comments on commit 4d8e820

Please sign in to comment.