import re

# Marker that identifies a table-of-contents page; such pages list every
# header and would otherwise produce false matches.
_TOC_MARKER = "СОДЕРЖАНИЕ"
# Marker of appendix headings; only the part before the first "." is kept.
_APPENDIX_MARKER = "ПРИЛОЖЕНИЕ"
# A hyphen directly followed by a line break: the two halves are glued back.
_HYPHEN_BREAK_RE = re.compile(r"-\n")
# A whitespace character followed by a line break: collapsed to one space.
_SPACE_BREAK_RE = re.compile(r"\s\n")


def parse_headers_and_pages(chapters, docx):
    """Fill in ``start_page`` for every chapter whose header occurs in the PDF text.

    Args:
        chapters: list of dicts as produced by ``parse_chapters`` (each has a
            ``"header"`` key and a ``"start_page"`` key). Updated in place.
        docx: parsed document object; ``docx.pdf_file.get_text_on_page()``
            must return a mapping of page -> page text.

    Returns:
        The same ``chapters`` list, with ``start_page`` set to the page on
        which the header was found. If a header occurs on several pages, the
        last such page in iteration order wins. Chapters whose header is
        never found keep their previous ``start_page``.
    """
    text_on_page = docx.pdf_file.get_text_on_page()
    for page, text in text_on_page.items():
        # Undo hyphenated line wraps and soft line breaks so that a header
        # split across lines can still be matched as a plain substring.
        text = _HYPHEN_BREAK_RE.sub("", text)
        text = _SPACE_BREAK_RE.sub(" ", text)
        if _TOC_MARKER in text:
            continue
        for chapter in chapters:
            header = chapter["header"]
            # An empty header is a substring of every page and would be
            # "found" everywhere, so it is never matched.
            if header and header in text:
                chapter["start_page"] = page
    return chapters


def parse_chapters(docx):
    """Collect the text of every heading-styled chapter that has child nodes.

    Args:
        docx: parsed document object whose ``chapters`` attribute is an
            iterable of dicts with ``"styled_text"`` (a dict holding
            ``"text"``), ``"style"`` and ``"child"`` (a list of nodes of the
            same ``"styled_text"`` shape).

    Returns:
        A list of ``{"header", "start_page", "text"}`` dicts, one per chapter
        that has at least one child and whose style contains ``"heading"``.
        ``start_page`` is initialised to 0 and ``text`` is the concatenation
        of the children's texts. For appendix headings only the part before
        the first ``"."`` is used as the header.
    """
    chapters = []
    for chapter in docx.chapters:
        head = chapter["styled_text"]["text"]
        if _APPENDIX_MARKER in head:
            head = head.split(".")[0]
        children = chapter["child"]
        # Only headings that actually own content become chapters.
        if not children or "heading" not in chapter["style"]:
            continue
        body = "".join(child["styled_text"]["text"] for child in children)
        chapters.append({"header": head, "start_page": 0, "text": body})
    return chapters
100644 --- a/app/server.py +++ b/app/server.py @@ -18,6 +18,7 @@ from flask_recaptcha import ReCaptcha import servants.user as user +from app.main.reports.docx_uploader import DocxUploader from app.utils import format_check_for_table, check_file from db import db_methods from db.db_types import Check @@ -29,6 +30,7 @@ from servants import pre_luncher from tasks import create_task from utils import checklist_filter, decorator_assertion, get_file_len, format_check +from main.reports.parse_file.parse_file import parse_chapters, parse_headers_and_pages logger = get_root_logger('web') UPLOAD_FOLDER = '/usr/src/project/files' @@ -213,6 +215,15 @@ def run_task(): converted_id = db_methods.add_file_to_db(filenamepdf, filepathpdf) else: converted_id = db_methods.write_pdf(filename, filepath) + + parsed_file = DocxUploader() + parsed_file.upload(filepath) + parsed_file.parse() + parsed_file.make_chapters("VKR") + parsed_file.make_headers("VKR") + chapters = parse_chapters(parsed_file) + chapters_with_headers = parse_headers_and_pages(chapters, parsed_file) + check = Check({ '_id': file_id, 'conv_pdf_fs_id': converted_id, @@ -225,7 +236,8 @@ def run_task(): 'score': -1, # score=-1 -> checking in progress 'is_ended': False, 'is_failed': False, - 'params_for_passback': current_user.params_for_passback + 'params_for_passback': current_user.params_for_passback, + 'parsed_chapters': chapters_with_headers }) db_methods.add_check(file_id, check) # add check for parsed_file to db task = create_task.delay(check.pack(to_str=True)) # add check to queue From 7057bbbd5df8a2e6bd718cef6c972136c49a824a Mon Sep 17 00:00:00 2001 From: vilka Date: Fri, 19 Apr 2024 16:22:45 +0300 Subject: [PATCH 2/2] add parsed_texts collection && add ParsedText in db_types.py && move text parsing to create_task --- app/db/db_methods.py | 9 ++++++++- app/db/db_types.py | 9 +++++++++ app/server.py | 13 +------------ app/tasks.py | 16 ++++++++++++++-- 4 files changed, 32 insertions(+), 15 deletions(-) diff --git 
def add_parsed_text(check_id, parsed_text):
    """Store a parsed text and link it to the file-info document.

    The packed ``parsed_text`` is inserted into ``parsed_texts_collection``;
    the id of the inserted document is then pushed onto the ``parsed_texts``
    array of the ``files_info_collection`` document whose ``_id`` equals
    ``check_id``.

    Returns:
        The id of the newly inserted parsed-text document.
    """
    insert_result = parsed_texts_collection.insert_one(parsed_text.pack())
    parsed_text_id = insert_result.inserted_id
    selector = {'_id': check_id}
    push_update = {"$push": {'parsed_texts': parsed_text_id}}
    files_info_collection.update_one(selector, push_update)
    return parsed_text_id
class ParsedText(PackableWithId):
    """Parsed chapters of an uploaded file, stored separately from the check.

    Attributes are read from ``dictionary`` when present:
        filename: name of the source file ('' when missing).
        parsed_chapters: list of parsed chapter dicts ([] when missing).
    """

    def __init__(self, dictionary=None):
        super().__init__(dictionary)
        dictionary = dictionary or {}
        self.filename = dictionary.get('filename', '')
        # Read the chapters from the dictionary (as Check does) instead of
        # always resetting them, so chapters passed to the constructor are
        # not silently discarded.
        self.parsed_chapters = dictionary.get('parsed_chapters', [])
b/app/tasks.py @@ -5,8 +5,9 @@ from celery import Celery import passback_grades +from app.main.reports.parse_file.parse_file import parse_headers_and_pages, parse_chapters from db import db_methods -from db.db_types import Check +from db.db_types import Check, ParsedText from main.checker import check from main.parser import parse from main.check_packs import BASE_PACKS @@ -41,10 +42,21 @@ def create_task(self, check_info): original_filepath = join(FILES_FOLDER, f"{check_id}.{check_obj.filename.rsplit('.', 1)[-1]}") pdf_filepath = join(FILES_FOLDER, f"{check_id}.pdf") try: - updated_check = check(parse(original_filepath, pdf_filepath), check_obj) + parsed_file_object = parse(original_filepath, pdf_filepath) + parsed_file_object.make_chapters(check_obj.file_type['report_type']) + parsed_file_object.make_headers(check_obj.file_type['report_type']) + chapters = parse_chapters(parsed_file_object) + + updated_check = check(parsed_file_object, check_obj) updated_check.is_ended = True updated_check.is_failed = False + updated_check.parsed_chapters = parse_headers_and_pages(chapters, parsed_file_object) + + parsed_text = ParsedText(check_info) + parsed_text.parsed_chapters = parse_headers_and_pages(chapters, parsed_file_object) + db_methods.update_check(updated_check) # save to db + db_methods.add_parsed_text(check_id, parsed_text) db_methods.mark_celery_task_as_finished(self.request.id) # remove files from FILES_FOLDER after checking