Skip to content

Commit

Permalink
add parsed text to DB
Browse files Browse the repository at this point in the history
  • Loading branch information
blindsphynx committed Feb 4, 2024
1 parent 342728e commit 5207285
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 21 deletions.
20 changes: 0 additions & 20 deletions .env_example

This file was deleted.

1 change: 1 addition & 0 deletions app/db/db_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def __init__(self, dictionary=None):
self.is_failed = dictionary.get('is_failed', None)
self.is_ended = dictionary.get('is_ended', True)
self.is_passed = dictionary.get('is_passed', int(self.score) == 1)
self.parsed_chapters = dictionary.get('parsed_chapters', [])

def calc_score(self):
# check after implementation criterion pack
Expand Down
28 changes: 28 additions & 0 deletions app/main/reports/parse_file/parse_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import re


def parse_headers_and_pages(chapters, docx):
    """Set ``start_page`` on each chapter by searching the PDF page texts.

    Args:
        chapters: list of chapter dicts, each with a ``"header"`` key.
            The dicts are updated in place.
        docx: object whose ``pdf_file.get_text_on_page()`` returns a
            mapping of page -> text of that page.

    Returns:
        The same ``chapters`` list. Every chapter whose header occurs on a
        scanned page gets ``"start_page"`` set to that page; if the header
        occurs on several pages, the last one visited wins.
    """
    # Compile once instead of handing the raw patterns to ``re`` per page.
    hyphen_break = re.compile(r"(-\n)")
    space_break = re.compile(r"\s\n")

    for page, text in docx.pdf_file.get_text_on_page().items():
        # Drop "-\n" sequences and turn "whitespace + newline" into a single
        # space before searching for the header strings.
        text = hyphen_break.sub("", text)
        text = space_break.sub(" ", text)
        # Pages containing this marker are skipped entirely
        # (presumably the table of contents, which lists every header).
        if "СОДЕРЖАНИЕ" in text:
            continue
        for chapter in chapters:
            header = chapter["header"]
            # An empty string is a substring of every text, so an empty
            # header would otherwise be "found" on every page.
            if header and header in text:
                chapter["start_page"] = page
    return chapters


def parse_chapters(docx):
    """Collect the heading chapters of ``docx`` together with their text.

    A chapter is kept only when it has a non-empty ``"child"`` list and its
    ``"style"`` contains ``"heading"``.

    Args:
        docx: object with a ``chapters`` iterable of dicts. Each dict has
            the keys ``"styled_text"`` (a dict with ``"text"``), ``"style"``
            and ``"child"`` (a list of dicts that also carry
            ``"styled_text"["text"]``).

    Returns:
        A list of dicts ``{"header", "start_page", "text"}``. ``"text"`` is
        the concatenation of the children's texts and ``"start_page"`` is
        initialised to 0.
    """
    parsed = []
    for chapter in docx.chapters:
        children = chapter["child"]
        # Skip chapters without children and anything that is not a heading.
        if not children or "heading" not in chapter["style"]:
            continue

        header = chapter["styled_text"]["text"]
        # For headers containing this marker keep only the part before the
        # first dot.
        if "ПРИЛОЖЕНИЕ" in header:
            header = header.split(".")[0]

        body = "".join(child["styled_text"]["text"] for child in children)
        parsed.append({"header": header, "start_page": 0, "text": body})
    return parsed
14 changes: 13 additions & 1 deletion app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from flask_recaptcha import ReCaptcha

import servants.user as user
from app.main.reports.docx_uploader import DocxUploader
from app.utils import format_check_for_table, check_file
from db import db_methods
from db.db_types import Check
Expand All @@ -29,6 +30,7 @@
from servants import pre_luncher
from tasks import create_task
from utils import checklist_filter, decorator_assertion, get_file_len, format_check
from main.reports.parse_file.parse_file import parse_chapters, parse_headers_and_pages

logger = get_root_logger('web')
UPLOAD_FOLDER = '/usr/src/project/files'
Expand Down Expand Up @@ -213,6 +215,15 @@ def run_task():
converted_id = db_methods.add_file_to_db(filenamepdf, filepathpdf)
else:
converted_id = db_methods.write_pdf(filename, filepath)

parsed_file = DocxUploader()
parsed_file.upload(filepath)
parsed_file.parse()
parsed_file.make_chapters("VKR")
parsed_file.make_headers("VKR")
chapters = parse_chapters(parsed_file)
chapters_with_headers = parse_headers_and_pages(chapters, parsed_file)

check = Check({
'_id': file_id,
'conv_pdf_fs_id': converted_id,
Expand All @@ -225,7 +236,8 @@ def run_task():
'score': -1, # score=-1 -> checking in progress
'is_ended': False,
'is_failed': False,
'params_for_passback': current_user.params_for_passback
'params_for_passback': current_user.params_for_passback,
'parsed_chapters': chapters_with_headers
})
db_methods.add_check(file_id, check) # add check for parsed_file to db
task = create_task.delay(check.pack(to_str=True)) # add check to queue
Expand Down

0 comments on commit 5207285

Please sign in to comment.