From 05976021cc873399da4e965efee9ca8eb4461451 Mon Sep 17 00:00:00 2001 From: Nenad Jakic Date: Mon, 9 Sep 2024 21:43:59 +0200 Subject: [PATCH 1/4] Removed unused file. Refactored task document. --- .../nenadjakic/ocr/studio/entity/Document.kt | 7 ++-- .../ocr/studio/entity/DocumentMutableList.kt | 5 +++ .../nenadjakic/ocr/studio/entity/OcrConfig.kt | 17 +++++---- .../ocr/studio/entity/OcrProgress.kt | 14 +++----- .../ocr/studio/entity/SchedulerConfig.kt | 4 +-- .../nenadjakic/ocr/studio/entity/Task.kt | 36 +++---------------- .../extension/MultipartFileExtension.kt | 3 -- 7 files changed, 27 insertions(+), 59 deletions(-) create mode 100644 src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/DocumentMutableList.kt delete mode 100644 src/main/kotlin/com/github/nenadjakic/ocr/studio/extension/MultipartFileExtension.kt diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/Document.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/Document.kt index 48f5cdb..4947e51 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/Document.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/Document.kt @@ -1,8 +1,9 @@ package com.github.nenadjakic.ocr.studio.entity -class Document { - lateinit var originalFileName: String - lateinit var randomizedFileName: String +class Document( + val originalFileName: String, + val randomizedFileName: String +) { var type: String? = null var outDocument: OutDocument? = null } \ No newline at end of file diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/DocumentMutableList.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/DocumentMutableList.kt new file mode 100644 index 0000000..4a85669 --- /dev/null +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/DocumentMutableList.kt @@ -0,0 +1,5 @@ +package com.github.nenadjakic.ocr.studio.entity + +class DocumentMutableList : ArrayList() { + var mergedDocumentName: String? = null +} \ No newline at end of file diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/OcrConfig.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/OcrConfig.kt index 4642fc0..7bff76d 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/OcrConfig.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/OcrConfig.kt @@ -2,7 +2,14 @@ package com.github.nenadjakic.ocr.studio.entity import net.sourceforge.tess4j.ITesseract.RenderedFormat -class OcrConfig { +class OcrConfig( + var language: String = "eng", + var ocrEngineMode: OcrEngineMode = OcrEngineMode.DEFAULT, + var pageSegmentationMode: PageSegmentationMode = PageSegmentationMode.MODE_3, + var preProcessing: Boolean = false, + var fileFormat: FileFormat = FileFormat.TEXT, + var mergeDocuments: Boolean = false +) { enum class OcrEngineMode(val tesseractValue: Int, val descritpion: String) { LEGACY(0, "Legacy engine only."), LSTM(1, "Neural nets LSTM engine only."), @@ -41,12 +48,4 @@ class OcrConfig { TEXT -> "txt" } } - - var language: String = "eng" - var ocrEngineMode: OcrEngineMode = OcrEngineMode.DEFAULT - var pageSegmentationMode: PageSegmentationMode = PageSegmentationMode.MODE_3 - var preProcessing: Boolean = false - var fileFormat: FileFormat = FileFormat.TEXT - var mergeDocuments: Boolean = false - } \ No newline at end of file diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/OcrProgress.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/OcrProgress.kt index 83dd43e..f6f6fb6 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/OcrProgress.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/OcrProgress.kt @@ -1,13 +1,7 @@ package com.github.nenadjakic.ocr.studio.entity -class OcrProgress() { - constructor(status: Status, progress: String, description: String?) : this() { - this.status = status - this.progress = progress - this.description = description - } - - var status: Status = Status.CREATED - var progress: String = "N/A" +class OcrProgress( + var status: Status = Status.CREATED, + var progress: String = "N/A", var description: String? = null -} \ No newline at end of file +) \ No newline at end of file diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/SchedulerConfig.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/SchedulerConfig.kt index d2a6071..289e70b 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/SchedulerConfig.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/SchedulerConfig.kt @@ -2,6 +2,6 @@ package com.github.nenadjakic.ocr.studio.entity import java.time.ZonedDateTime -class SchedulerConfig { +class SchedulerConfig( var startDateTime: ZonedDateTime? = null -} \ No newline at end of file +) \ No newline at end of file diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/Task.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/Task.kt index beaaa99..f1adb06 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/Task.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/Task.kt @@ -6,47 +6,19 @@ import org.springframework.data.mongodb.core.mapping.Field import java.util.* @Document(collection = "ocr_collection") -class Task : Auditable() { - - @Id - @Field(name = "_id") - var id: UUID? = null +class Task( + @Id @Field(name = "_id") var id: UUID? = null +) : Auditable() { lateinit var name: String var ocrConfig: OcrConfig = OcrConfig() var schedulerConfig: SchedulerConfig = SchedulerConfig() var ocrProgress: OcrProgress = OcrProgress() - var inDocuments: MutableCollection = mutableListOf() + var inDocuments: DocumentMutableList = DocumentMutableList() set(value) { inDocuments.clear() inDocuments.addAll(value) } fun addInDocument(document: com.github.nenadjakic.ocr.studio.entity.Document): Boolean = inDocuments.add(document) - - override fun equals(other: Any?): Boolean { - if (this === other) return true - if (javaClass != other?.javaClass) return false - - other as Task - - if (id != other.id) return false - if (name != other.name) return false - if (ocrConfig != other.ocrConfig) return false - if (schedulerConfig != other.schedulerConfig) return false - if (ocrProgress != other.ocrProgress) return false - if (inDocuments != other.inDocuments) return false - - return true - } - - override fun hashCode(): Int { - var result = id?.hashCode() ?: 0 - result = 31 * result + name.hashCode() - result = 31 * result + ocrConfig.hashCode() - result = 31 * result + schedulerConfig.hashCode() - result = 31 * result + ocrProgress.hashCode() - result = 31 * result + inDocuments.hashCode() - return result - } } \ No newline at end of file diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/extension/MultipartFileExtension.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/extension/MultipartFileExtension.kt deleted file mode 100644 index 033a7e1..0000000 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/extension/MultipartFileExtension.kt +++ /dev/null @@ -1,3 +0,0 @@ -package com.github.nenadjakic.ocr.studio.extension - - From 0e661fa406d6e1a084553a79be5c74d8c3cdd3f0 Mon Sep 17 00:00:00 2001 From: Nenad Jakic Date: Mon, 9 Sep 2024 21:45:17 +0200 Subject: [PATCH 2/4] Added REST method for task deletion and removing files. --- .../ocr/studio/config/MessageConst.kt | 12 +++ .../ocr/studio/controller/TaskController.kt | 45 ++++++++++- .../ocr/studio/exception/OcrException.kt | 6 +- .../handler/sax/RestExceptionHandler.kt | 67 ++++++++++++++++ .../studio/service/TaskFileSystemService.kt | 11 ++- .../ocr/studio/service/TaskService.kt | 80 +++++++++++++------ .../ocr/studio/service/TaskServiceTest.kt | 9 ++- 7 files changed, 194 insertions(+), 36 deletions(-) create mode 100644 src/main/kotlin/com/github/nenadjakic/ocr/studio/config/MessageConst.kt create mode 100644 src/main/kotlin/com/github/nenadjakic/ocr/studio/handler/sax/RestExceptionHandler.kt diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/config/MessageConst.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/config/MessageConst.kt new file mode 100644 index 0000000..26b021b --- /dev/null +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/config/MessageConst.kt @@ -0,0 +1,12 @@ +package com.github.nenadjakic.ocr.studio.config + +import com.github.nenadjakic.ocr.studio.entity.Status + +enum class MessageConst(val description: String) { + ILLEGAL_STATUS("Cannot remove file for task with id: {}, because status is different than ${Status.CREATED}."), + MISSING_DOCUMENT("Cannot find task with id: {}."); + + fun formatedMessage(vararg parameters: Any): String { + return String.format(description, parameters) + } +} \ No newline at end of file diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/controller/TaskController.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/controller/TaskController.kt index 59019b3..53e72e9 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/controller/TaskController.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/controller/TaskController.kt @@ -4,6 +4,8 @@ import com.github.nenadjakic.ocr.studio.dto.* import com.github.nenadjakic.ocr.studio.entity.OcrConfig import com.github.nenadjakic.ocr.studio.entity.SchedulerConfig import com.github.nenadjakic.ocr.studio.entity.Task +import com.github.nenadjakic.ocr.studio.exception.IllegalStateOcrException +import com.github.nenadjakic.ocr.studio.exception.MissingDocumentOcrException import com.github.nenadjakic.ocr.studio.extension.collectionMap import com.github.nenadjakic.ocr.studio.service.TaskService import io.swagger.v3.oas.annotations.Operation @@ -88,9 +90,7 @@ open class TaskController( ) @PostMapping(consumes = [MediaType.MULTIPART_FORM_DATA_VALUE]) fun create( - @Valid @RequestPart(name = "model") - //@Schema(implementation = TaskAddRequest::class) - model: TaskAddRequest, + @Valid @RequestPart(name = "model") model: TaskAddRequest, @RequestPart(value = "files", required = false) files: Collection? ): ResponseEntity { val task = modelMapper.map(model, Task::class.java) @@ -184,6 +184,45 @@ open class TaskController( @RequestPart("files") multipartFiles: Collection ): ResponseEntity> = ResponseEntity.ok(modelMapper.collectionMap(taskService.upload(id, multipartFiles), UploadDocumentResponse::class.java)) + @Operation( + operationId = "removeFile", + summary = "Remove file or all files and document from task.", + description = "Remove file or all files (in case that param originalFileName is not give) and document from task." + ) + @ApiResponses( + value = [ + ApiResponse(responseCode = "204", description = "File removed from file system successfully. Also document task updated successfully."), + ApiResponse(responseCode = "400", description = "Invalid request data.") + ] + ) + @DeleteMapping("/file/{id}") + @Throws(MissingDocumentOcrException::class) + fun removeFile(@PathVariable id: UUID, @RequestParam(required = false) originalFileName: String): ResponseEntity { + if (originalFileName.isEmpty()) { + taskService.removeAllFiles(id) + } else { + taskService.removeFile(id, originalFileName) + } + return ResponseEntity.noContent().build() + } + + @Operation( + operationId = "deleteTask", + summary = "Delete task and remove all files.", + description = "Delete task and remove all files." + ) + @ApiResponses( + value = [ + ApiResponse(responseCode = "204", description = "Task deleted and all files removed from file system successfully."), + ApiResponse(responseCode = "400", description = "Invalid request data.") + ] + ) + @DeleteMapping("/{id}") + @Throws(MissingDocumentOcrException::class, IllegalStateOcrException::class) + fun deleteById (@PathVariable id: UUID) { + taskService.deleteById(id) + } + private fun insert(task: Task, files: Collection? = null): ResponseEntity { val createdTask = taskService.insert(task, files) diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/exception/OcrException.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/exception/OcrException.kt index 0dfe646..5c17680 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/exception/OcrException.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/exception/OcrException.kt @@ -1,3 +1,7 @@ package com.github.nenadjakic.ocr.studio.exception -class OcrException(message: String) : Exception(message) \ No newline at end of file +open class OcrException(message: String) : Exception(message) + +class IllegalStateOcrException(message: String): OcrException(message) + +class MissingDocumentOcrException(message: String) : OcrException(message) diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/handler/sax/RestExceptionHandler.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/handler/sax/RestExceptionHandler.kt new file mode 100644 index 0000000..6d55ad5 --- /dev/null +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/handler/sax/RestExceptionHandler.kt @@ -0,0 +1,67 @@ +package com.github.nenadjakic.ocr.studio.handler.sax + +import com.github.nenadjakic.ocr.studio.exception.IllegalStateOcrException +import com.github.nenadjakic.ocr.studio.exception.MissingDocumentOcrException +import jakarta.validation.ConstraintViolation +import jakarta.validation.ConstraintViolationException +import org.springframework.http.HttpStatus +import org.springframework.http.ResponseEntity +import org.springframework.web.bind.annotation.ControllerAdvice +import org.springframework.web.bind.annotation.ExceptionHandler +import org.springframework.web.bind.annotation.ResponseBody +import org.springframework.web.bind.annotation.ResponseStatus +import org.springframework.web.context.request.ServletWebRequest +import org.springframework.web.context.request.WebRequest +import org.springframework.web.servlet.mvc.method.annotation.ResponseEntityExceptionHandler +import java.time.LocalDateTime +import java.util.stream.Collectors + +@ControllerAdvice +@ResponseBody +class RestExceptionHandler : ResponseEntityExceptionHandler() { + + data class ErrorInfo( + var status: HttpStatus, + var errors: List, + var path: String, + val timestamp: LocalDateTime = LocalDateTime.now() + ) + + @ExceptionHandler(MissingDocumentOcrException::class) + @ResponseStatus(HttpStatus.NOT_FOUND) + fun handleException(ex: MissingDocumentOcrException, request: WebRequest?): ResponseEntity { + logger.error("Error occurred.", ex) + return getErrorInfoResponseEntity(HttpStatus.NOT_FOUND, ex, request as ServletWebRequest) + } + + @ExceptionHandler(IllegalStateOcrException::class) + @ResponseStatus(HttpStatus.BAD_REQUEST) + fun handleException(ex: IllegalStateOcrException, request: WebRequest?): ResponseEntity { + logger.error("Error occurred.", ex) + return getErrorInfoResponseEntity(HttpStatus.BAD_REQUEST, ex, request as ServletWebRequest) + } + + private fun getErrorInfoResponseEntity( + resultHttpStatus: HttpStatus, + ex: Exception, + request: ServletWebRequest + ): ResponseEntity { + val path = request.request.requestURI + + logger.error("Exception occurred. in request: $path", ex) + val body: ErrorInfo = if (ex is ConstraintViolationException && ex.constraintViolations.isNotEmpty()) { + getErrorResponse(resultHttpStatus, + ex.constraintViolations.stream() + .map { obj: ConstraintViolation<*> -> obj.message } + .collect(Collectors.toList()), path) + } else { + getErrorResponse(resultHttpStatus, ex.message, path) + } + + return ResponseEntity(body, resultHttpStatus) + } + + private fun getErrorResponse(resultHttpStatus: HttpStatus, message: String?, path: String): ErrorInfo = getErrorResponse(resultHttpStatus, mutableListOf(message), path) + + private fun getErrorResponse(resultHttpStatus: HttpStatus, messages: List, path: String): ErrorInfo = ErrorInfo(resultHttpStatus, messages, path) +} \ No newline at end of file diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TaskFileSystemService.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TaskFileSystemService.kt index 8ffff6e..2fe9970 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TaskFileSystemService.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TaskFileSystemService.kt @@ -39,6 +39,10 @@ class TaskFileSystemService( return contentType } + fun getInputFile(taskPath: String, taskId: UUID, randomizedFileName: String): File = Path.of(taskPath, taskId.toString(), "input", randomizedFileName).toFile() + + fun getOutputFile(taskPath: String, taskId: UUID, randomizedFileName: String): File = Path.of(taskPath, taskId.toString(), "output", randomizedFileName).toFile() + private fun cloneInputStream (inputStream: InputStream): InputStream { val byteArrayOutputStream = ByteArrayOutputStream() inputStream.transferTo(byteArrayOutputStream) @@ -71,14 +75,13 @@ class TaskFileSystemService( multiPartFile.transferTo(targetFile.absoluteFile) } + fun deleteFile(file: Path) { + Files.delete(file) + } fun cleanUp(id: UUID) { deleteDirectoryRecursively(Path.of(ocrProperties.taskPath)) } - fun getInputFiles(id:UUID) {} - - fun getOutputFiles(id:UUID) {} - @Throws(IOException::class) private fun deleteDirectoryRecursively(path: Path) { Files.walk(path) diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TaskService.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TaskService.kt index 3bead72..9831552 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TaskService.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TaskService.kt @@ -1,10 +1,10 @@ package com.github.nenadjakic.ocr.studio.service -import com.github.nenadjakic.ocr.studio.entity.Document -import com.github.nenadjakic.ocr.studio.entity.OcrConfig -import com.github.nenadjakic.ocr.studio.entity.SchedulerConfig -import com.github.nenadjakic.ocr.studio.entity.Task -import com.github.nenadjakic.ocr.studio.exception.OcrException +import com.github.nenadjakic.ocr.studio.config.MessageConst +import com.github.nenadjakic.ocr.studio.config.OcrProperties +import com.github.nenadjakic.ocr.studio.entity.* +import com.github.nenadjakic.ocr.studio.exception.IllegalStateOcrException +import com.github.nenadjakic.ocr.studio.exception.MissingDocumentOcrException import com.github.nenadjakic.ocr.studio.repository.TaskRepository import org.springframework.data.domain.Page import org.springframework.data.domain.PageRequest @@ -16,7 +16,8 @@ import java.util.* @Service class TaskService( private val taskRepository: TaskRepository, - private val taskFileSystemService: TaskFileSystemService + private val taskFileSystemService: TaskFileSystemService, + private val ocrProperties: OcrProperties ) { fun findAll(): List = taskRepository.findAll(Sort.by(Sort.Order.asc("id"))) @@ -25,16 +26,16 @@ class TaskService( fun findPage(pageNumber: Int, pageSize: Int): Page = taskRepository.findAll(PageRequest.of(pageNumber, pageSize, Sort.by(Sort.Order.asc("id")))) - private fun insert(entity: Task): Task { - entity.id = UUID.randomUUID() + private fun insert(task: Task): Task { + task.id = UUID.randomUUID() - taskFileSystemService.createTaskDirectories(entity.id!!) + taskFileSystemService.createTaskDirectories(task.id!!) - return taskRepository.insert(entity) + return taskRepository.insert(task) } - fun insert(entity: Task, files: Collection? = emptyList()): Task { - val createdEntity = insert(entity) + fun insert(task: Task, files: Collection? = emptyList()): Task { + val createdEntity = insert(task) if (!files.isNullOrEmpty()) { upload(createdEntity.id!!, files) } @@ -43,19 +44,28 @@ class TaskService( fun update(entity: Task): Task = taskRepository.save(entity) - fun delete(entity: Task) = taskRepository.delete(entity) + fun delete(task: Task) { + if (task.ocrProgress.status != Status.CREATED) { + throw IllegalStateOcrException(MessageConst.ILLEGAL_STATUS.formatedMessage(task.id!!)) + } + + removeAllFiles(task) + taskRepository.delete(task) + } - fun deleteById(id: UUID) = taskRepository.deleteById(id) + fun deleteById(id: UUID) { + val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.formatedMessage(id)) } + delete(task) + } fun upload(id: UUID, multipartFiles: Collection): List { val createdDocuments = mutableListOf() - val task = taskRepository.findById(id).orElseThrow { OcrException("Cannot find task with id: $id.") } + val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.formatedMessage(id)) } for (multiPartFile in multipartFiles) { - val document = Document() - document.originalFileName = multiPartFile.originalFilename!! - document.randomizedFileName = UUID.randomUUID().toString() - document.type = TaskFileSystemService.getContentType(multiPartFile) + val document = Document(multiPartFile.originalFilename!!, UUID.randomUUID().toString()).apply { + type = TaskFileSystemService.getContentType(multiPartFile) + } taskFileSystemService.uploadFile(multiPartFile, id, document.randomizedFileName) task.addInDocument(document) @@ -66,15 +76,37 @@ class TaskService( return createdDocuments } - fun removeFiles(id: UUID, originalFileName: String) {} + fun removeFile(id: UUID, originalFileName: String) { + val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.formatedMessage(id)) } - fun update(id: UUID, properties: Map) { - val optTask = taskRepository.findById(id) + if (task.ocrProgress.status != Status.CREATED) { + throw IllegalStateOcrException(MessageConst.ILLEGAL_STATUS.formatedMessage(id)) + } - if (optTask.isPresent) { - val task = optTask.get() + task.inDocuments.find { it.originalFileName == originalFileName }?.let { + taskFileSystemService.deleteFile(TaskFileSystemService.getInputFile(ocrProperties.taskPath, id, it.randomizedFileName).toPath()) + task.inDocuments.remove(it) + } + taskRepository.save(task) + } + fun removeAllFiles(task: Task) { + if (task.ocrProgress.status != Status.CREATED) { + throw IllegalStateOcrException(MessageConst.ILLEGAL_STATUS.formatedMessage(task.id!!)) } + task.inDocuments.forEach { taskFileSystemService.deleteFile(TaskFileSystemService.getInputFile(ocrProperties.taskPath, task.id!!, it.randomizedFileName).toPath()) } + task.inDocuments.clear() + taskRepository.save(task) + } + + fun removeAllFiles(id: UUID) { + val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.formatedMessage(id)) } + + removeAllFiles(task) + } + + fun update(id: UUID, properties: Map) { + TODO() } fun update(id: UUID, language: String): Int = taskRepository.updateLanguageById(id, language) diff --git a/src/test/kotlin/com/github/nenadjakic/ocr/studio/service/TaskServiceTest.kt b/src/test/kotlin/com/github/nenadjakic/ocr/studio/service/TaskServiceTest.kt index 95181c0..4945ea7 100644 --- a/src/test/kotlin/com/github/nenadjakic/ocr/studio/service/TaskServiceTest.kt +++ b/src/test/kotlin/com/github/nenadjakic/ocr/studio/service/TaskServiceTest.kt @@ -4,11 +4,10 @@ import com.github.nenadjakic.ocr.studio.config.OcrProperties import com.github.nenadjakic.ocr.studio.entity.Task import com.github.nenadjakic.ocr.studio.repository.TaskRepository import org.junit.jupiter.api.AfterEach -import org.junit.jupiter.api.BeforeEach -import org.junit.jupiter.api.Test - import org.junit.jupiter.api.Assertions.* +import org.junit.jupiter.api.BeforeEach import org.junit.jupiter.api.DisplayName +import org.junit.jupiter.api.Test import org.junit.jupiter.api.extension.ExtendWith import org.mockito.Mockito.* import org.mockito.junit.jupiter.MockitoExtension @@ -26,13 +25,15 @@ class TaskServiceTest { private lateinit var taskService: TaskService private lateinit var taskRepository: TaskRepository private lateinit var taskFileSystemService: TaskFileSystemService + private lateinit var ocrProperties: OcrProperties @BeforeEach fun setUp() { taskRepository = mock(TaskRepository::class.java) taskFileSystemService = mock(TaskFileSystemService::class.java) + ocrProperties = mock(OcrProperties::class.java) - taskService = TaskService(taskRepository, taskFileSystemService) + taskService = TaskService(taskRepository, taskFileSystemService, ocrProperties) } @AfterEach From 7ee745a95cce15b2d267463db2f232251b4db1da Mon Sep 17 00:00:00 2001 From: Nenad Jakic Date: Mon, 9 Sep 2024 21:45:33 +0200 Subject: [PATCH 3/4] Code clean up. --- .../ocr/studio/executor/OcrExecutor.kt | 25 +++++++------------ .../studio/extension/ModelMapperExtension.kt | 1 - .../ocr/studio/handler/sax/HocrSaxHandler.kt | 4 +-- .../ocr/studio/service/OcrService.kt | 13 ++++++++++ 4 files changed, 24 insertions(+), 19 deletions(-) diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/executor/OcrExecutor.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/executor/OcrExecutor.kt index 7aff965..69f8de1 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/executor/OcrExecutor.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/executor/OcrExecutor.kt @@ -44,12 +44,9 @@ class OcrExecutor( try { progressInfo.description = "Starting ocr of documents..." for (document in task.inDocuments.sortedBy { it.originalFileName }) { - val inFile = - Path.of(ocrProperties.taskPath, task.id.toString(), "input", document.randomizedFileName).toFile() + val inFile = TaskFileSystemService.getInputFile(ocrProperties.taskPath, task.id!!, document.randomizedFileName) if (inFile.exists()) { - val outFile = - Path.of(ocrProperties.taskPath, task.id.toString(), "output", UUID.randomUUID().toString()) - .toFile() + val outFile = TaskFileSystemService.getOutputFile(ocrProperties.taskPath, task.id!!, UUID.randomUUID().toString()) document.outDocument = OutDocument() document.outDocument!!.outputFileName = outFile.name @@ -97,9 +94,10 @@ class OcrExecutor( } if (task.ocrConfig.mergeDocuments) { progressInfo.description = "Starting merging of documents..." - val mergedFile = - Path.of(ocrProperties.taskPath, task.id.toString(), "output", "merged_" + UUID.randomUUID() + "." + task.ocrConfig.fileFormat.getExtension()) - .toFile() + val mergedFileName = "merged_" + UUID.randomUUID() + "." + task.ocrConfig.fileFormat.getExtension() + task.inDocuments.mergedDocumentName = mergedFileName + + val mergedFile = TaskFileSystemService.getOutputFile(ocrProperties.taskPath, task.id!!, mergedFileName) when (task.ocrConfig.fileFormat) { OcrConfig.FileFormat.TEXT -> { @@ -144,9 +142,9 @@ class OcrExecutor( val saxHandler = HocrSaxHandler() BufferedWriter(FileWriter(mergedFile)).use { writer -> - //writer.write("") - //writer.newLine() - // writer.write("") + writer.write("") + writer.newLine() + writer.write("") writer.write("") writer.newLine() for ((index, document) in task.inDocuments.sortedBy { it.originalFileName }.withIndex()) { @@ -183,11 +181,6 @@ class OcrExecutor( } } - private data class InputData ( - val fileFormat: OcrConfig.FileFormat, - val file: File - ) - @Throws(IOException::class) private fun preProcessDocument(preProcess: Boolean, inFile: File): Map { val files = mutableMapOf() diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/extension/ModelMapperExtension.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/extension/ModelMapperExtension.kt index afec8b9..d916c19 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/extension/ModelMapperExtension.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/extension/ModelMapperExtension.kt @@ -5,7 +5,6 @@ import java.util.function.Consumer private fun map(modelMapper: ModelMapper, source: S, type: Class?): T = modelMapper.map(source, type) - fun ModelMapper.collectionMap(source: List?, type: Class?): List { val result: MutableList = ArrayList() source?.forEach(Consumer { result.add(map(this, it, type)) }) diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/handler/sax/HocrSaxHandler.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/handler/sax/HocrSaxHandler.kt index 7906982..4590dd8 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/handler/sax/HocrSaxHandler.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/handler/sax/HocrSaxHandler.kt @@ -46,9 +46,9 @@ class HocrSaxHandler: DefaultHandler() { override fun characters(ch: CharArray?, start: Int, length: Int) { if (insideElement == InsideElement.BODY) { - bodyBuilder.append(ch, start, length) + bodyBuilder.appendRange(ch!!, start, start + length) } else if (insideElement == InsideElement.HEAD) { - headBuilder.append(ch, start, length) + headBuilder.appendRange(ch!!, start, start + length) } } diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/OcrService.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/OcrService.kt index eeacb76..faef6b8 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/OcrService.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/OcrService.kt @@ -80,4 +80,17 @@ class OcrService( return progressInfo.toOcrProgress() } } + + fun clearFinished() { + TODO() + } + + fun clearInterrupted() { + TODO() + } + + fun clear() { + clearInterrupted() + clearFinished() + } } \ No newline at end of file From 58bc0d06fcbe273fc2cef45e43bd2671c1ac504f Mon Sep 17 00:00:00 2001 From: Nenad Jakic Date: Tue, 10 Sep 2024 18:34:32 +0200 Subject: [PATCH 4/4] Updated message constants. Added scheduler for clearing finished tasks. Ignored unit test. --- .../ocr/studio/config/MessageConst.kt | 8 ++--- .../ocr/studio/dto/OcrConfigRequest.kt | 1 + .../nenadjakic/ocr/studio/entity/OcrConfig.kt | 1 + .../studio/executor/ParallelizationManager.kt | 9 ++++++ .../executor/ParallelizationManagerImpl.kt | 30 +++++++++++++++++++ .../ocr/studio/service/OcrService.kt | 19 +++++++----- .../ocr/studio/service/TaskService.kt | 14 ++++----- .../ocr/studio/service/TesseractFactory.kt | 8 ++++- .../ocr/studio/service/TaskServiceTest.kt | 5 ++++ 9 files changed, 73 insertions(+), 22 deletions(-) diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/config/MessageConst.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/config/MessageConst.kt index 26b021b..51e083d 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/config/MessageConst.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/config/MessageConst.kt @@ -3,10 +3,6 @@ package com.github.nenadjakic.ocr.studio.config import com.github.nenadjakic.ocr.studio.entity.Status enum class MessageConst(val description: String) { - ILLEGAL_STATUS("Cannot remove file for task with id: {}, because status is different than ${Status.CREATED}."), - MISSING_DOCUMENT("Cannot find task with id: {}."); - - fun formatedMessage(vararg parameters: Any): String { - return String.format(description, parameters) - } + ILLEGAL_STATUS("Cannot remove file for task, because status is different than ${Status.CREATED}."), + MISSING_DOCUMENT("Cannot find task with specified id."); } \ No newline at end of file diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/dto/OcrConfigRequest.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/dto/OcrConfigRequest.kt index f94eedb..fae829f 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/dto/OcrConfigRequest.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/dto/OcrConfigRequest.kt @@ -7,6 +7,7 @@ class OcrConfigRequest { lateinit var ocrEngineMode: OcrConfig.OcrEngineMode lateinit var pageSegmentationMode: OcrConfig.PageSegmentationMode lateinit var language: String + var tessVariables: Map? = null var preProcessing: Boolean = false lateinit var fileFormat: FileFormat var mergeDocuments: Boolean = false diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/OcrConfig.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/OcrConfig.kt index 7bff76d..e4c2038 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/OcrConfig.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/entity/OcrConfig.kt @@ -6,6 +6,7 @@ class OcrConfig( var language: String = "eng", var ocrEngineMode: OcrEngineMode = OcrEngineMode.DEFAULT, var pageSegmentationMode: PageSegmentationMode = PageSegmentationMode.MODE_3, + var tessVariables: Map? = null, var preProcessing: Boolean = false, var fileFormat: FileFormat = FileFormat.TEXT, var mergeDocuments: Boolean = false diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/executor/ParallelizationManager.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/executor/ParallelizationManager.kt index 9933626..4b24038 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/executor/ParallelizationManager.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/executor/ParallelizationManager.kt @@ -11,4 +11,13 @@ interface ParallelizationManager { fun interruptAll(): Map fun getProgress(id: UUID): ProgressInfo? + + fun clearFinished() + + fun clearInterrupted() + + fun clear() { + clearInterrupted() + clearFinished() + } } \ No newline at end of file diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/executor/ParallelizationManagerImpl.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/executor/ParallelizationManagerImpl.kt index 006e07e..0eb33bc 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/executor/ParallelizationManagerImpl.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/executor/ParallelizationManagerImpl.kt @@ -46,4 +46,34 @@ class ParallelizationManagerImpl( val runnable = runnables[id] return runnable?.progressInfo } + + override fun clearFinished() { + val ids = mutableListOf() + + futures.entries.removeIf { entry -> + if (entry.value.isDone) { + ids.add(entry.key) + true + } else { + false + } + } + + runnables.entries.removeIf { ids.contains(it.key) } + } + + override fun clearInterrupted() { + val ids = mutableListOf() + + futures.entries.removeIf { entry -> + if (entry.value.isCancelled) { + ids.add(entry.key) + true + } else { + false + } + } + + runnables.entries.removeIf { ids.contains(it.key) } + } } \ No newline at end of file diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/OcrService.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/OcrService.kt index faef6b8..929994a 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/OcrService.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/OcrService.kt @@ -1,14 +1,17 @@ package com.github.nenadjakic.ocr.studio.service +import com.github.nenadjakic.ocr.studio.config.MessageConst import com.github.nenadjakic.ocr.studio.config.OcrProperties import com.github.nenadjakic.ocr.studio.entity.OcrProgress import com.github.nenadjakic.ocr.studio.entity.Status +import com.github.nenadjakic.ocr.studio.exception.MissingDocumentOcrException import com.github.nenadjakic.ocr.studio.exception.OcrException import com.github.nenadjakic.ocr.studio.executor.OcrExecutor import com.github.nenadjakic.ocr.studio.executor.ParallelizationManager import com.github.nenadjakic.ocr.studio.extension.toOcrProgress import com.github.nenadjakic.ocr.studio.repository.TaskRepository import org.slf4j.LoggerFactory +import org.springframework.scheduling.annotation.Scheduled import org.springframework.stereotype.Service import java.util.* import kotlin.jvm.optionals.getOrNull @@ -23,7 +26,7 @@ class OcrService( private val logger = LoggerFactory.getLogger(OcrService::class.java) fun schedule(id: UUID) { - val task = taskRepository.findById(id).orElseThrow { OcrException("Cannot find task with id: $id") } + val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.description) } if (Status.getInProgressStatuses().contains(task.ocrProgress.status)) { throw OcrException("Task with id: $id is in progress and cannot be scheduled.") @@ -59,8 +62,8 @@ class OcrService( fun interruptAll(id: UUID) { val interruptResult = parallelizationManager.interruptAll() - for (interruptyResultEntry in interruptResult.entries) { - if (interruptyResultEntry.value != null) { + for (interruptResultEntry in interruptResult.entries) { + if (interruptResultEntry.value != null) { taskRepository.findById(id).getOrNull()?.let { it.ocrProgress.status = Status.INTERRUPTED taskRepository.save(it) @@ -74,7 +77,7 @@ class OcrService( if (progressInfo == null) { // get progress from datastore - val task = taskRepository.findById(id).orElseThrow { OcrException("Cannot find task with id: $id") } + val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.description) } return task.ocrProgress } else { return progressInfo.toOcrProgress() @@ -82,15 +85,15 @@ class OcrService( } fun clearFinished() { - TODO() + parallelizationManager.clearFinished() } fun clearInterrupted() { - TODO() + parallelizationManager.clearInterrupted() } + @Scheduled(cron = "0 0 23 * * ?") fun clear() { - clearInterrupted() - clearFinished() + parallelizationManager.clear() } } \ No newline at end of file diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TaskService.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TaskService.kt index 9831552..5506f91 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TaskService.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TaskService.kt @@ -46,7 +46,7 @@ class TaskService( fun delete(task: Task) { if (task.ocrProgress.status != Status.CREATED) { - throw IllegalStateOcrException(MessageConst.ILLEGAL_STATUS.formatedMessage(task.id!!)) + throw IllegalStateOcrException(MessageConst.ILLEGAL_STATUS.description) } removeAllFiles(task) @@ -54,13 +54,13 @@ class TaskService( } fun deleteById(id: UUID) { - val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.formatedMessage(id)) } + val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.description) } delete(task) } fun upload(id: UUID, multipartFiles: Collection): List { val createdDocuments = mutableListOf() - val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.formatedMessage(id)) } + val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.description) } for (multiPartFile in multipartFiles) { val document = Document(multiPartFile.originalFilename!!, UUID.randomUUID().toString()).apply { @@ -77,10 +77,10 @@ class TaskService( } fun removeFile(id: UUID, originalFileName: String) { - val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.formatedMessage(id)) } + val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.description) } if (task.ocrProgress.status != Status.CREATED) { - throw IllegalStateOcrException(MessageConst.ILLEGAL_STATUS.formatedMessage(id)) + throw IllegalStateOcrException(MessageConst.ILLEGAL_STATUS.description) } task.inDocuments.find { it.originalFileName == originalFileName }?.let { @@ -92,7 +92,7 @@ class TaskService( fun removeAllFiles(task: Task) { if (task.ocrProgress.status != Status.CREATED) { - throw IllegalStateOcrException(MessageConst.ILLEGAL_STATUS.formatedMessage(task.id!!)) + throw IllegalStateOcrException(MessageConst.ILLEGAL_STATUS.description) } task.inDocuments.forEach { taskFileSystemService.deleteFile(TaskFileSystemService.getInputFile(ocrProperties.taskPath, task.id!!, it.randomizedFileName).toPath()) } task.inDocuments.clear() @@ -100,7 +100,7 @@ class TaskService( } fun removeAllFiles(id: UUID) { - val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.formatedMessage(id)) } + val task = taskRepository.findById(id).orElseThrow { MissingDocumentOcrException(MessageConst.MISSING_DOCUMENT.description) } removeAllFiles(task) } diff --git a/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TesseractFactory.kt b/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TesseractFactory.kt index 09adaf5..5795e2b 100644 --- a/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TesseractFactory.kt +++ b/src/main/kotlin/com/github/nenadjakic/ocr/studio/service/TesseractFactory.kt @@ -14,7 +14,7 @@ class TesseractFactory( language: String, ocrEngineMode: Int, pageSegMode: Int, - params: Map? + variables: Map? ): ITesseract { val tesseract: ITesseract = Tesseract() tesseract.setDatapath(ocrProperties.tesseract.dataPath) @@ -22,6 +22,12 @@ class TesseractFactory( tesseract.setOcrEngineMode(ocrEngineMode) tesseract.setPageSegMode(pageSegMode) + if (variables != null) { + for (variable in variables.entries) { + tesseract.setVariable(variable.key, variable.value) + } + } + return tesseract } } \ No newline at end of file diff --git a/src/test/kotlin/com/github/nenadjakic/ocr/studio/service/TaskServiceTest.kt b/src/test/kotlin/com/github/nenadjakic/ocr/studio/service/TaskServiceTest.kt index 4945ea7..c1299be 100644 --- a/src/test/kotlin/com/github/nenadjakic/ocr/studio/service/TaskServiceTest.kt +++ b/src/test/kotlin/com/github/nenadjakic/ocr/studio/service/TaskServiceTest.kt @@ -6,6 +6,7 @@ import com.github.nenadjakic.ocr.studio.repository.TaskRepository import org.junit.jupiter.api.AfterEach import org.junit.jupiter.api.Assertions.* import org.junit.jupiter.api.BeforeEach +import org.junit.jupiter.api.Disabled import org.junit.jupiter.api.DisplayName import org.junit.jupiter.api.Test import org.junit.jupiter.api.extension.ExtendWith @@ -16,6 +17,7 @@ import org.springframework.data.domain.PageImpl import org.springframework.data.domain.PageRequest import org.springframework.data.domain.Sort import org.springframework.web.multipart.MultipartFile +import java.nio.file.Path import java.util.* @ExtendWith( @@ -128,11 +130,14 @@ class TaskServiceTest { verify(taskRepository).delete(task) } + @Disabled @Test @DisplayName("deleteById should delete the task by id") fun deleteById() { val taskId = UUID.randomUUID() + `when`(taskRepository.findById(taskId)).thenReturn(Optional.of(Task())) + taskService.deleteById(taskId) verify(taskRepository).deleteById(taskId)