diff --git a/app/db/db_methods.py b/app/db/db_methods.py
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -21,3 +21,4 @@ checks_collection = db['checks']
+parsed_texts_collection = db['parsed_texts']
 logs_collection = db.create_collection(
     'logs', capped=True, size=5242880) if not db['logs'] else db['logs']
 celery_check_collection = db['celery_check']  # collection for mapping celery_task to check
@@ -144,6 +145,20 @@ def update_check(check):
     return bool(checks_collection.find_one_and_replace({'_id': check._id}, check.pack()))
 
 
+def add_parsed_text(check_id, parsed_text):
+    """Insert a parsed text and link it to its file record.
+
+    ``parsed_text.pack()`` is inserted into ``parsed_texts_collection`` and the
+    new document id is pushed onto the ``parsed_texts`` array of the
+    ``files_info_collection`` document whose ``_id`` equals ``check_id``.
+
+    Returns the id of the inserted document.
+    """
+    parsed_text_id = parsed_texts_collection.insert_one(parsed_text.pack()).inserted_id
+    files_info_collection.update_one({'_id': check_id}, {"$push": {'parsed_texts': parsed_text_id}})
+    return parsed_text_id
+
+
 def write_pdf(filename, filepath):
     converted_filepath = convert_to(filepath, target_format='pdf')
     return add_file_to_db(filename, converted_filepath)
diff --git a/app/db/db_types.py b/app/db/db_types.py
--- a/app/db/db_types.py
+++ b/app/db/db_types.py
@@ -104,3 +104,4 @@ def __init__(self, dictionary=None):
         self.is_failed = dictionary.get('is_failed', None)
         self.is_ended = dictionary.get('is_ended', True)
         self.is_passed = dictionary.get('is_passed', int(self.score) == 1)
+        self.parsed_chapters = dictionary.get('parsed_chapters', [])
@@ -145,3 +146,14 @@ def none_to_false(x):
         is_ended = none_to_true(self.is_ended)  # None for old checks => True, True->True, False->False
         is_failed = none_to_false(self.is_failed)  # None for old checks => False, True->True, False->False
         return {'is_ended': is_ended, 'is_failed': is_failed}
+
+
+class ParsedText(PackableWithId):
+    """Parsed chapters of an uploaded file, stored in the ``parsed_texts`` collection."""
+
+    def __init__(self, dictionary=None):
+        super().__init__(dictionary)
+        dictionary = dictionary or {}
+        self.filename = dictionary.get('filename', '')
+        # Restore chapters from a stored document instead of always resetting them.
+        self.parsed_chapters = dictionary.get('parsed_chapters', [])
diff --git a/app/main/reports/parse_file/parse_file.py b/app/main/reports/parse_file/parse_file.py
new file mode 100644
--- /dev/null
+++ b/app/main/reports/parse_file/parse_file.py
@@ -0,0 +1,53 @@
+import re
+
+# "СОДЕРЖАНИЕ" is Russian for "CONTENTS"; pages containing it are skipped,
+# presumably because a table of contents repeats every header.
+CONTENTS_MARKER = "СОДЕРЖАНИЕ"
+# "ПРИЛОЖЕНИЕ" is Russian for "APPENDIX".
+APPENDIX_MARKER = "ПРИЛОЖЕНИЕ"
+
+# Compiled once at import time instead of on every page.
+_HYPHEN_BREAK = re.compile(r"(-\n)")
+_SPACE_BREAK = re.compile(r"\s\n")
+
+
+def parse_headers_and_pages(chapters, docx):
+    """Set ``start_page`` of each chapter whose header occurs in the PDF text.
+
+    ``docx.pdf_file.get_text_on_page()`` is expected to return a mapping of
+    page to page text. If a header occurs on several pages, the page visited
+    last wins.
+
+    The chapter dicts are updated in place and the same list is returned.
+    """
+    for page, text in docx.pdf_file.get_text_on_page().items():
+        # Drop "-\n" and turn whitespace + "\n" into a space, so that a header
+        # broken across lines can still be found as a substring.
+        text = _HYPHEN_BREAK.sub("", text)
+        text = _SPACE_BREAK.sub(" ", text)
+        if CONTENTS_MARKER in text:
+            continue
+        for chapter in chapters:
+            if chapter["header"] in text:
+                chapter["start_page"] = page
+    return chapters
+
+
+def parse_chapters(docx):
+    """Build the chapter list from the heading entries of ``docx.chapters``.
+
+    An entry is kept when it has children and its style contains "heading".
+    The texts of its children are concatenated; an appendix header is cut at
+    the first ".". ``start_page`` starts at 0.
+    """
+    chapters = []
+    for chapter in docx.chapters:
+        children = chapter["child"]
+        if not children or "heading" not in chapter["style"]:
+            continue
+        head = chapter["styled_text"]["text"]
+        if APPENDIX_MARKER in head:
+            head = head.split(".")[0]
+        text = "".join(child["styled_text"]["text"] for child in children)
+        chapters.append({"header": head, "start_page": 0, "text": text})
+    return chapters
diff --git a/app/server.py b/app/server.py
--- a/app/server.py
+++ b/app/server.py
@@ -213,6 +213,7 @@ def run_task():
         converted_id = db_methods.add_file_to_db(filenamepdf, filepathpdf)
     else:
         converted_id = db_methods.write_pdf(filename, filepath)
+
     check = Check({
         '_id': file_id,
         'conv_pdf_fs_id': converted_id,
@@ -225,7 +226,8 @@ def run_task():
         'score': -1,  # score=-1 -> checking in progress
         'is_ended': False,
         'is_failed': False,
-        'params_for_passback': current_user.params_for_passback
+        'params_for_passback': current_user.params_for_passback,
+        'parsed_chapters': [],
     })
     db_methods.add_check(file_id, check)  # add check for parsed_file to db
     task = create_task.delay(check.pack(to_str=True))  # add check to queue
diff --git a/app/tasks.py b/app/tasks.py
--- a/app/tasks.py
+++ b/app/tasks.py
@@ -8,5 +8,6 @@ from celery import Celery
 from db import db_methods
-from db.db_types import Check
+from db.db_types import Check, ParsedText
 from main.checker import check
 from main.parser import parse
 from main.check_packs import BASE_PACKS
+from main.reports.parse_file.parse_file import parse_chapters, parse_headers_and_pages
@@ -41,8 +42,24 @@ def create_task(self, check_info):
     original_filepath = join(FILES_FOLDER, f"{check_id}.{check_obj.filename.rsplit('.', 1)[-1]}")
     pdf_filepath = join(FILES_FOLDER, f"{check_id}.pdf")
     try:
-        updated_check = check(parse(original_filepath, pdf_filepath), check_obj)
+        parsed_file_object = parse(original_filepath, pdf_filepath)
+        report_type = check_obj.file_type['report_type']
+        parsed_file_object.make_chapters(report_type)
+        parsed_file_object.make_headers(report_type)
+        chapters = parse_chapters(parsed_file_object)
+
+        updated_check = check(parsed_file_object, check_obj)
         updated_check.is_ended = True
         updated_check.is_failed = False
+
+        # parse_headers_and_pages fills the chapter dicts in place and returns
+        # the same list, so resolve the start pages once and reuse the result.
+        parsed_chapters = parse_headers_and_pages(chapters, parsed_file_object)
+        updated_check.parsed_chapters = parsed_chapters
+
+        parsed_text = ParsedText(check_info)
+        parsed_text.parsed_chapters = parsed_chapters
+
         db_methods.update_check(updated_check)  # save to db
+        db_methods.add_parsed_text(check_id, parsed_text)
         db_methods.mark_celery_task_as_finished(self.request.id)