diff --git a/app/main/checks/report_checks/banned_words_in_literature.py b/app/main/checks/report_checks/banned_words_in_literature.py index 5671785c..6e9f8358 100644 --- a/app/main/checks/report_checks/banned_words_in_literature.py +++ b/app/main/checks/report_checks/banned_words_in_literature.py @@ -13,6 +13,7 @@ def __init__(self, file_info, banned_words=["wikipedia"]): self.literature_header = [] self.banned_words = [morph.normal_forms(word)[0] for word in banned_words] self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)' + self.md_name_pattern = r'

<h2>список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)</h2>

' def late_init_vkr(self): self.literature_header = self.file.find_literature_vkr(self.file_type['report_type']) @@ -83,6 +84,6 @@ def start_of_literature_chapter(self, ): start_index = 0 for i in range(len(self.file.paragraphs)): text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1] - if re.fullmatch(self.name_pattern, text_string): + if re.fullmatch(f'{self.name_pattern}|{self.md_name_pattern}', text_string): start_index = i return start_index diff --git a/app/main/checks/report_checks/image_references.py b/app/main/checks/report_checks/image_references.py index 7bcb6256..79e46fd7 100644 --- a/app/main/checks/report_checks/image_references.py +++ b/app/main/checks/report_checks/image_references.py @@ -63,7 +63,10 @@ def check(self): def search_references(self): array_of_references = set() for i in range(0, self.last_child_number): - detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i].paragraph_text) + if isinstance(self.file.paragraphs[i], str): + detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i]) + else: + detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i].paragraph_text) if detected_references: for reference in detected_references: for one_part in re.split(r'[Рр]ис\.|,| ', reference): diff --git a/app/main/checks/report_checks/literature_references.py b/app/main/checks/report_checks/literature_references.py index 3e8515a9..cdc529f3 100644 --- a/app/main/checks/report_checks/literature_references.py +++ b/app/main/checks/report_checks/literature_references.py @@ -12,6 +12,7 @@ def __init__(self, file_info, min_ref=1, max_ref=1000): self.headers = [] self.literature_header = [] self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)' + self.md_name_pattern = r"
<h2>
(Список использованных источников|Список использованной литературы)<\/h2>" self.min_ref = min_ref self.max_ref = max_ref @@ -77,7 +78,10 @@ def check(self): def search_references(self, start_par): array_of_references = set() for i in range(0, start_par): - detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1]) + if isinstance(self.file.paragraphs[i], str): + detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i]) + else: + detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1]) if detected_references: for reference in detected_references: for one_part in re.split(r'[\[\],]', reference): @@ -92,10 +96,16 @@ def search_references(self, start_par): def find_start_paragraph(self): start_index = 0 for i in range(len(self.file.paragraphs)): - text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1] - if re.fullmatch(self.name_pattern, text_string): - start_index = i - break + if isinstance(self.file.paragraphs[i], str): + text_string = self.file.paragraphs[i].lower() + if re.fullmatch(self.md_name_pattern, text_string): + start_index = i + break + else: + text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1] + if re.fullmatch(self.name_pattern, text_string): + start_index = i + break return start_index def count_sources_vkr(self, header): @@ -142,4 +152,4 @@ def search_literature_start_pdf(self): if re.search('приложение а[\n .]', lowercase_str): end_page = i break - return start_page, end_page \ No newline at end of file + return start_page, end_page diff --git a/app/main/checks/report_checks/table_references.py b/app/main/checks/report_checks/table_references.py index d390872b..43aa51b9 100644 --- a/app/main/checks/report_checks/table_references.py +++ b/app/main/checks/report_checks/table_references.py @@ -63,7 +63,10 @@ def check(self): def search_references(self): array_of_references = set() for i in range(0, 
self.last_child_number): - detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text) + if isinstance(self.file.paragraphs[i], str): + detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i]) + else: + detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text) if detected_references: for reference in detected_references: for one_part in re.split(r'таблиц[аеыу]| ', reference): diff --git a/app/main/parser.py b/app/main/parser.py index 5cf671de..b185af4c 100644 --- a/app/main/parser.py +++ b/app/main/parser.py @@ -5,6 +5,7 @@ from main.presentations import PresentationPPTX from main.reports.docx_uploader import DocxUploader +from main.reports.md_uploader import MdUpload from utils import convert_to logger = logging.getLogger('root_logger') @@ -19,15 +20,24 @@ def parse(filepath, pdf_filepath): logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.") new_filepath = convert_to(filepath, target_format='pptx') file_object = PresentationPPTX(new_filepath) - elif tmp_filepath.endswith(('.doc', '.odt', '.docx')): + elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )): new_filepath = filepath if tmp_filepath.endswith(('.doc', '.odt')): logger.info(f"Отчёт {filepath} старого формата. Временно преобразован в docx для обработки.") new_filepath = convert_to(filepath, target_format='docx') + docx = DocxUploader() docx.upload(new_filepath, pdf_filepath) docx.parse() file_object = docx + + elif tmp_filepath.endswith('.md' ): + new_filepath = filepath + doc = MdUpload(new_filepath) + md_text = doc.upload() + doc.parse(md_text) + file_object = doc + else: raise ValueError("Файл с недопустимым именем или недопустимого формата: " + filepath) # Если была конвертация, то удаляем временный файл. 
diff --git a/app/main/reports/README.md b/app/main/reports/README.md index 4c591479..d54bd128 100644 --- a/app/main/reports/README.md +++ b/app/main/reports/README.md @@ -66,11 +66,6 @@ Proof-of-concept парсинг файлов `.docx` с выводом стру $ python3 -m app.main.mse22.pdf_document text_from_pages --filename path_to_file ``` -## `MD` - -Парсинг файлов `.md` с выводом структуры файла в текстовом виде в stdout. - ```bash $ python3 -m app.main.reports.md_uploader md_parser --mdfile path_to_md_file -``` - +``` \ No newline at end of file diff --git a/app/main/reports/document_uploader.py b/app/main/reports/document_uploader.py new file mode 100644 index 00000000..a67600f0 --- /dev/null +++ b/app/main/reports/document_uploader.py @@ -0,0 +1,35 @@ +from abc import ABC, abstractmethod + +class DocumentUploader(ABC): + + @abstractmethod + def upload(self): + pass + + @abstractmethod + def parse(self): + pass + + @abstractmethod + def parse_effective_styles(self): + pass + + @abstractmethod + def page_counter(self): + pass + + @abstractmethod + def make_headers(self, work_type): + pass + + @abstractmethod + def make_chapters(self, work_type): + pass + + @abstractmethod + def find_header_page(self, work_type): + pass + + @abstractmethod + def find_literature_vkr(self, work_type): + pass diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py index 14efdc05..93e5e7a3 100644 --- a/app/main/reports/docx_uploader/docx_uploader.py +++ b/app/main/reports/docx_uploader/docx_uploader.py @@ -9,9 +9,10 @@ from .style import Style from .table import Table, Cell from ..pdf_document.pdf_document_manager import PdfDocumentManager +from ..document_uploader import DocumentUploader -class DocxUploader: +class DocxUploader(DocumentUploader): def __init__(self): self.inline_shapes = [] self.core_properties = None diff --git a/app/main/reports/md_uploader/md_uploader.py b/app/main/reports/md_uploader/md_uploader.py index 926052ea..25346ffc 
100644 --- a/app/main/reports/md_uploader/md_uploader.py +++ b/app/main/reports/md_uploader/md_uploader.py @@ -1,42 +1,206 @@ +'''Available checks for md-file: +pack "BaseReportCriterionPackMd" + +[ + [ + "simple_check" + ], + [ + "banned_words_in_literature" + ], + [ + "page_counter" + ], + [ + "short_sections_check" + ], + [ + "banned_words_check" + ], + [ + "right_words_check" + ], + [ + "banned_words_in_literature" + ], + [ + "literature_references" + ], + [ + "image_references" + ], + [ + "table_references" + ], + [ + "first_pages_check" + ], + [ + "main_character_check" + ], + [ + "needed_headers_check" + ], + [ + "report_section_component" + ], + [ + "spelling_check" + ] +] +''' + import markdown #installation: pip install markdown +from md2pdf.core import md2pdf #installation: pip install md2pdf import re +# from functools import reduce +from PIL import Image +from io import BytesIO +import requests + +# from ..docx_uploader.inline_shape import InlineShape +from ..document_uploader import DocumentUploader +from ..pdf_document.pdf_document_manager import PdfDocumentManager + -class MdUpload: +class MdUpload(DocumentUploader): def __init__(self, path_to_md_file): + self.pdf_file = None self.path_to_md_file = path_to_md_file + self.paragraphs = [] + self.headers_main = [] self.headers = [] self.chapters = [] - self.paragraphs = [] self.html_text = '' + self.count = 0 self.tables = [] self.chapter_with_text = [] + self.literature_header = [] + self.headers_page = 1 + self.styled_paragraphs = [] + self.first_lines = [] + self.inline_shapes = [] - def read_md_file(self): + def upload(self): with open(self.path_to_md_file, "r", encoding="utf-8") as f: md_text = f.read() return md_text - def get_html_from_md(self, md_text): - self.html_text = markdown.markdown(md_text) - self.paragraphs = self.html_text.split('\n') + def parse(self, md_text): + self.html_text = markdown.markdown(md_text) + self.paragraphs = self.make_paragraphs(self.html_text) + 
self.parse_effective_styles() + self.pdf_filepath = self.path_to_md_file.split('.')[0]+'.pdf' + self.pdf_file = PdfDocumentManager(self.path_to_md_file, md2pdf(self.pdf_filepath, md_file_path=self.path_to_md_file)) + + def make_paragraphs(self, html_text): + self.paragraphs = html_text.split('\n') + return self.paragraphs + + def page_counter(self): + if not self.count: + for k, v in self.pdf_file.text_on_page.items(): + line = v[:20] if len(v) > 21 else v + if re.search('ПРИЛОЖЕНИЕ [А-Я]', line.strip()): + break + self.count += 1 + line = '' + lines = v.split("\n") + for i in range(len(lines)): + if i > 1: + break + if i > 0: + line += " " + line += lines[i].strip() + self.first_lines.append(line.lower()) + return self.count + + def get_main_headers(self): + header_main_regex = "
<h1>
(.*?)<\/h1>" + self.headers_main = re.findall(header_main_regex, self.html_text) - def get_headers(self): - header_regex = "
<h1>
(.*?)<\/h1>" - self.headers = re.findall(header_regex, self.html_text) + def make_headers(self, work_type): + if not self.headers: + if work_type == 'VKR': + # find first pages + headers = [ + {"name": "Титульный лист", "marker": False, "key": "санкт-петербургский государственный", + "main_character": True, "page": 0}, + {"name": "Задание на выпускную квалификационную работу", "marker": False, "key": "задание", + "main_character": True, "page": 0}, + {"name": "Календарный план", "marker": False, "key": "календарный план", "main_character": True, + "page": 0}, + {"name": "Реферат", "marker": False, "key": "реферат", "main_character": False, "page": 0}, + {"name": "Abstract", "marker": False, "key": "abstract", "main_character": False, "page": 0}, + {"name": "Содержание", "marker": False, "key": "содержание", "main_character": False, "page": 0}] + for page in range(1, self.count if self.page_counter() < 2 * len(headers) else 2 * len(headers)): + page_text = (self.pdf_file.get_text_on_page()[page].lower()) + for i in range(len(headers)): + if not headers[i]["marker"]: + if page_text.find(headers[i]["key"]) >= 0: + headers[i]["marker"] = True + headers[i]["page"] = page + break + self.headers = headers + return self.headers - def get_chapters(self): - chapter_regex = "
<h2>
(.*?)<\/h2>" - self.chapters = re.findall(chapter_regex, self.html_text) + def parse_effective_styles(self): + for par in self.paragraphs: + if len(par.strip()) > 0: + paragraph = {"text": par, "runs": []} + if '
<h2>
' in paragraph["text"]: + paragraph["runs"].append({"text": par, "style": "heading 2"}) + elif 'Таблица' in paragraph["text"]: + if '|' in self.paragraphs[self.paragraphs.index(par)+1]: + paragraph['runs'].append({"text": par, "style": "вкр_подпись таблицы"}) + elif ' in paragraph[= 0: + par_num += 1 + self.chapters[header_ind]["child"].append( + {"style": style_name, "text": self.styled_paragraphs[par_ind]["text"], + "styled_text": self.styled_paragraphs[par_ind], "number": head_par_ind}) + return self.chapters - def get_chapter_with_text(self): - text = self.html_text - chapter_name = '' - for chapter in self.chapters: - self.split_chapter = text.split("
<h2>" + chapter + "</h2>
") - self.chapter_with_text.append(chapter_name + self.split_chapter[-2]) - chapter_name = chapter - text = self.split_chapter[-1] - self.chapter_with_text.append(chapter_name + text) + def find_images(self): + total_height = 0 + images = [k['runs'][0]['text'] for k in self.styled_paragraphs if k['runs'][0]['style'] == 'рисунок'] + images_regex = '(https://[\S]+\.(jpg|png))+' + images_links = [re.findall(images_regex, k)[0][0] for k in images if re.findall(images_regex, k)] + for link in images_links: + response = requests.get(link) + image = Image.open(BytesIO(response.content)) + dpi_image = image.info.get("dpi", (72, 72)) + width, height = round((image.width/dpi_image[0])*2.54, 3), round((image.height/dpi_image[1])*2.54, 3) + total_height += width + self.inline_shapes.append((width, height)) + return self.inline_shapes def get_tables_size(self): count_table_line = 0 @@ -46,16 +210,28 @@ def get_tables_size(self): count_table_line +=1 return round(count_table_line/count_paragraph, 4) + def find_literature_vkr(self, work_type): + if not self.literature_header: + for header in self.make_chapters(work_type): + header_text = header["text"].lower() + if header_text.find('список использованных источников') >= 0: + self.literature_header = header + return self.literature_header + + def find_header_page(self, work_type): + return self.headers_page + def parse_md_file(self): - md_text = self.read_md_file() - self.get_html_from_md(md_text) - self.get_headers() - self.get_chapters() - self.get_chapter_with_text() + md_text = self.upload() + self.parse(md_text) + self.make_headers(work_type="VKR") self.get_tables_size() - return f"Заголовки:\n{self.headers}\n\nГлавы:\n{self.chapters}\n\nГлавы с текстом:\n{self.chapter_with_text}\n\nДоля таблиц в тексте:\n{self.get_tables_size()}" + self.make_chapters(work_type="VKR") + self.find_images() + self.find_literature_vkr(work_type="VKR") + return 
f"Заголовки:\n{self.headers_main}\n\nГлавы\n{self.chapters}\n\nИзображения:\n\n{self.inline_shapes}" + def main(args): md_file = MdUpload(args.mdfile) print(md_file.parse_md_file()) - diff --git a/app/main/reports/pdf_document/pdf_document_manager.py b/app/main/reports/pdf_document/pdf_document_manager.py index ddc125e0..1500756f 100644 --- a/app/main/reports/pdf_document/pdf_document_manager.py +++ b/app/main/reports/pdf_document/pdf_document_manager.py @@ -4,7 +4,7 @@ class PdfDocumentManager: - def __init__(self, path_to_file, pdf_filepath=''): + def __init__(self, path_to_file, pdf_filepath): if not pdf_filepath: self.pdf_file = pdfplumber.open(convert_to(path_to_file, target_format='pdf')) else: diff --git a/app/server.py b/app/server.py index 1bfe9127..edbe6284 100644 --- a/app/server.py +++ b/app/server.py @@ -34,8 +34,9 @@ UPLOAD_FOLDER = '/usr/src/project/files' ALLOWED_EXTENSIONS = { 'pres': {'ppt', 'pptx', 'odp'}, - 'report': {'doc', 'odt', 'docx'} + 'report': {'doc', 'odt', 'docx', 'md'} } + DOCUMENT_TYPES = {'Лабораторная работа', 'Курсовая работа', 'ВКР'} TABLE_COLUMNS = ['Solution', 'User', 'File', 'Criteria', 'Check added', 'LMS date', 'Score'] URL_DOMEN = os.environ.get('URL_DOMEN', f"http://localhost:{os.environ.get('WEB_PORT', 8080)}") diff --git a/app/utils/check_file.py b/app/utils/check_file.py index c559ae54..309abfd7 100644 --- a/app/utils/check_file.py +++ b/app/utils/check_file.py @@ -3,9 +3,13 @@ def check_file(file, file_extension, allowed_extensions, check_mime=True): if not file_extension in allowed_extensions: return "not_allowed_extension" + + if check_mime: + if file_extension == 'md': + if file.mimetype != 'text/plain': + return "mime_type_does_not_match_extension" + else: + if file_extension != filetype.guess_extension(file): + return "mime_type_does_not_match_extension" - # Проверяем MIME тип (библиотека автоматически умеет переводить MIME в реальное расширение файла). 
- if check_mime and file_extension != filetype.guess_extension(file): - return "mime_type_does_not_match_extension" - - return "ok" \ No newline at end of file + return "ok" diff --git a/requirements.txt b/requirements.txt index 0d87cf6c..226de495 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,4 +27,5 @@ pdfplumber==0.6.1 pytest~=7.1.2 filetype==1.2.0 language-tool-python==2.7.1 +md2pdf==1.0.1 markdown==3.4.4