diff --git a/app/main/checks/report_checks/banned_words_in_literature.py b/app/main/checks/report_checks/banned_words_in_literature.py
index 5671785c..6e9f8358 100644
--- a/app/main/checks/report_checks/banned_words_in_literature.py
+++ b/app/main/checks/report_checks/banned_words_in_literature.py
@@ -13,6 +13,7 @@ def __init__(self, file_info, banned_words=["wikipedia"]):
self.literature_header = []
self.banned_words = [morph.normal_forms(word)[0] for word in banned_words]
self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)'
+        # Markdown reports are rendered to HTML, where the literature header is an <h2> element.
+        self.md_name_pattern = r'<h2>список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)</h2>'
def late_init_vkr(self):
self.literature_header = self.file.find_literature_vkr(self.file_type['report_type'])
@@ -83,6 +84,6 @@ def start_of_literature_chapter(self, ):
start_index = 0
for i in range(len(self.file.paragraphs)):
- text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1]
- if re.fullmatch(self.name_pattern, text_string):
+            paragraph = self.file.paragraphs[i]
+            if isinstance(paragraph, str):
+                # Markdown reports keep their paragraphs as plain strings, which have no
+                # to_string(); use the lowercased line itself.
+                text_string = paragraph.lower()
+            else:
+                # Other paragraph objects: the text to match is the second line of to_string().
+                text_string = paragraph.to_string().lower().split('\n')[1]
+            if re.fullmatch(f'{self.name_pattern}|{self.md_name_pattern}', text_string):
start_index = i
return start_index
diff --git a/app/main/checks/report_checks/image_references.py b/app/main/checks/report_checks/image_references.py
index 7bcb6256..79e46fd7 100644
--- a/app/main/checks/report_checks/image_references.py
+++ b/app/main/checks/report_checks/image_references.py
@@ -63,7 +63,10 @@ def check(self):
def search_references(self):
array_of_references = set()
for i in range(0, self.last_child_number):
- detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i].paragraph_text)
+ if isinstance(self.file.paragraphs[i], str):
+ detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i])
+ else:
+ detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i].paragraph_text)
if detected_references:
for reference in detected_references:
for one_part in re.split(r'[Рр]ис\.|,| ', reference):
diff --git a/app/main/checks/report_checks/literature_references.py b/app/main/checks/report_checks/literature_references.py
index 3e8515a9..cdc529f3 100644
--- a/app/main/checks/report_checks/literature_references.py
+++ b/app/main/checks/report_checks/literature_references.py
@@ -12,6 +12,7 @@ def __init__(self, file_info, min_ref=1, max_ref=1000):
self.headers = []
self.literature_header = []
self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)'
+ self.md_name_pattern = r"(Список использованных источников|Список использованной литературы)<\/h2>"
self.min_ref = min_ref
self.max_ref = max_ref
@@ -77,7 +78,10 @@ def check(self):
def search_references(self, start_par):
array_of_references = set()
for i in range(0, start_par):
- detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1])
+ if isinstance(self.file.paragraphs[i], str):
+ detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i])
+ else:
+ detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1])
if detected_references:
for reference in detected_references:
for one_part in re.split(r'[\[\],]', reference):
@@ -92,10 +96,16 @@ def search_references(self, start_par):
def find_start_paragraph(self):
start_index = 0
for i in range(len(self.file.paragraphs)):
- text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1]
- if re.fullmatch(self.name_pattern, text_string):
- start_index = i
- break
+            paragraph = self.file.paragraphs[i]
+            if isinstance(paragraph, str):
+                # Markdown reports: each paragraph is a plain string. md_name_pattern is
+                # written with a capital letter, so lowercasing only the text could never
+                # match it; compare case-insensitively instead.
+                is_header = re.fullmatch(self.md_name_pattern, paragraph, flags=re.IGNORECASE)
+            else:
+                # Other paragraph objects: the text to match is the second line of to_string().
+                text_string = paragraph.to_string().lower().split('\n')[1]
+                is_header = re.fullmatch(self.name_pattern, text_string)
+            if is_header:
+                start_index = i
+                break
return start_index
def count_sources_vkr(self, header):
@@ -142,4 +152,4 @@ def search_literature_start_pdf(self):
if re.search('приложение а[\n .]', lowercase_str):
end_page = i
break
- return start_page, end_page
\ No newline at end of file
+ return start_page, end_page
diff --git a/app/main/checks/report_checks/table_references.py b/app/main/checks/report_checks/table_references.py
index d390872b..43aa51b9 100644
--- a/app/main/checks/report_checks/table_references.py
+++ b/app/main/checks/report_checks/table_references.py
@@ -63,7 +63,10 @@ def check(self):
def search_references(self):
array_of_references = set()
for i in range(0, self.last_child_number):
- detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text)
+ if isinstance(self.file.paragraphs[i], str):
+ detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i])
+ else:
+ detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text)
if detected_references:
for reference in detected_references:
for one_part in re.split(r'таблиц[аеыу]| ', reference):
diff --git a/app/main/parser.py b/app/main/parser.py
index 5cf671de..b185af4c 100644
--- a/app/main/parser.py
+++ b/app/main/parser.py
@@ -5,6 +5,7 @@
from main.presentations import PresentationPPTX
from main.reports.docx_uploader import DocxUploader
+from main.reports.md_uploader import MdUpload
from utils import convert_to
logger = logging.getLogger('root_logger')
@@ -19,15 +20,24 @@ def parse(filepath, pdf_filepath):
logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.")
new_filepath = convert_to(filepath, target_format='pptx')
file_object = PresentationPPTX(new_filepath)
- elif tmp_filepath.endswith(('.doc', '.odt', '.docx')):
+ elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )):
new_filepath = filepath
if tmp_filepath.endswith(('.doc', '.odt')):
logger.info(f"Отчёт {filepath} старого формата. Временно преобразован в docx для обработки.")
new_filepath = convert_to(filepath, target_format='docx')
+
docx = DocxUploader()
docx.upload(new_filepath, pdf_filepath)
docx.parse()
file_object = docx
+
+ elif tmp_filepath.endswith('.md' ):
+ new_filepath = filepath
+ doc = MdUpload(new_filepath)
+ md_text = doc.upload()
+ doc.parse(md_text)
+ file_object = doc
+
else:
raise ValueError("Файл с недопустимым именем или недопустимого формата: " + filepath)
# Если была конвертация, то удаляем временный файл.
diff --git a/app/main/reports/README.md b/app/main/reports/README.md
index 4c591479..d54bd128 100644
--- a/app/main/reports/README.md
+++ b/app/main/reports/README.md
@@ -66,11 +66,6 @@ Proof-of-concept парсинг файлов `.docx` с выводом стру
$ python3 -m app.main.mse22.pdf_document text_from_pages --filename path_to_file
```
-## `MD`
-
-Парсинг файлов `.md` с выводом структуры файла в текстовом виде в stdout.
-
```bash
$ python3 -m app.main.reports.md_uploader md_parser --mdfile path_to_md_file
-```
-
+```
\ No newline at end of file
diff --git a/app/main/reports/document_uploader.py b/app/main/reports/document_uploader.py
new file mode 100644
index 00000000..a67600f0
--- /dev/null
+++ b/app/main/reports/document_uploader.py
@@ -0,0 +1,35 @@
+from abc import ABC, abstractmethod
+
+class DocumentUploader(ABC):
+ """Abstract interface for report uploaders (subclassed by DocxUploader and MdUpload)."""
+ # NOTE(review): the abstract signatures below take no extra arguments, while the concrete
+ # classes are used as DocxUploader.upload(filepath, pdf_filepath) and MdUpload.parse(md_text).
+ # ABC only checks that the methods exist, not their signatures - confirm the intended contract.
+
+ @abstractmethod
+ def upload(self):
+ """Load the source document (MdUpload reads the file and returns its text)."""
+ pass
+
+ @abstractmethod
+ def parse(self):
+ """Build the parsed representation of the loaded document."""
+ pass
+
+ @abstractmethod
+ def parse_effective_styles(self):
+ """Derive per-paragraph style information from the parsed paragraphs."""
+ pass
+
+ @abstractmethod
+ def page_counter(self):
+ """Return the page count (MdUpload stops counting at the first appendix page)."""
+ pass
+
+ @abstractmethod
+ def make_headers(self, work_type):
+ """Return the header records for the given work type."""
+ pass
+
+ @abstractmethod
+ def make_chapters(self, work_type):
+ """Return the chapters for the given work type."""
+ pass
+
+ @abstractmethod
+ def find_header_page(self, work_type):
+ """Return the header page for the given work type (presumably a page number - confirm)."""
+ pass
+
+ @abstractmethod
+ def find_literature_vkr(self, work_type):
+ """Return the chapter that holds the literature list for the given work type."""
+ pass
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index 14efdc05..93e5e7a3 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -9,9 +9,10 @@
from .style import Style
from .table import Table, Cell
from ..pdf_document.pdf_document_manager import PdfDocumentManager
+from ..document_uploader import DocumentUploader
-class DocxUploader:
+class DocxUploader(DocumentUploader):
def __init__(self):
self.inline_shapes = []
self.core_properties = None
diff --git a/app/main/reports/md_uploader/md_uploader.py b/app/main/reports/md_uploader/md_uploader.py
index 926052ea..25346ffc 100644
--- a/app/main/reports/md_uploader/md_uploader.py
+++ b/app/main/reports/md_uploader/md_uploader.py
@@ -1,42 +1,206 @@
+'''Checks available for Markdown (.md) reports,
+criterion pack "BaseReportCriterionPackMd":
+
+[
+ [
+ "simple_check"
+ ],
+ [
+ "banned_words_in_literature"
+ ],
+ [
+ "page_counter"
+ ],
+ [
+ "short_sections_check"
+ ],
+ [
+ "banned_words_check"
+ ],
+ [
+ "right_words_check"
+ ],
+ [
+ "banned_words_in_literature"
+ ],
+ [
+ "literature_references"
+ ],
+ [
+ "image_references"
+ ],
+ [
+ "table_references"
+ ],
+ [
+ "first_pages_check"
+ ],
+ [
+ "main_character_check"
+ ],
+ [
+ "needed_headers_check"
+ ],
+ [
+ "report_section_component"
+ ],
+ [
+ "spelling_check"
+ ]
+]
+'''
+
import markdown #installation: pip install markdown
+from md2pdf.core import md2pdf #installation: pip install md2pdf
import re
+# from functools import reduce
+from PIL import Image
+from io import BytesIO
+import requests
+
+# from ..docx_uploader.inline_shape import InlineShape
+from ..document_uploader import DocumentUploader
+from ..pdf_document.pdf_document_manager import PdfDocumentManager
+
-class MdUpload:
+class MdUpload(DocumentUploader):
def __init__(self, path_to_md_file):
+ self.pdf_file = None
self.path_to_md_file = path_to_md_file
+ self.paragraphs = []
+ self.headers_main = []
self.headers = []
self.chapters = []
- self.paragraphs = []
self.html_text = ''
+ self.count = 0
self.tables = []
self.chapter_with_text = []
+ self.literature_header = []
+ self.headers_page = 1
+ self.styled_paragraphs = []
+ self.first_lines = []
+ self.inline_shapes = []
- def read_md_file(self):
+ def upload(self):
with open(self.path_to_md_file, "r", encoding="utf-8") as f:
md_text = f.read()
return md_text
- def get_html_from_md(self, md_text):
- self.html_text = markdown.markdown(md_text)
- self.paragraphs = self.html_text.split('\n')
+    def parse(self, md_text):
+        """Convert markdown text to HTML, fill paragraph/style data and attach the PDF view.
+
+        md_text is the raw markdown source (as returned by upload()). The PDF is rendered
+        next to the source file, with the final extension replaced by '.pdf'.
+        """
+        self.html_text = markdown.markdown(md_text)
+        self.paragraphs = self.make_paragraphs(self.html_text)
+        self.parse_effective_styles()
+        # Replace only the final extension: split('.')[0] truncated any path with a dot
+        # elsewhere (e.g. './report.md' became '.pdf').
+        self.pdf_filepath = self.path_to_md_file.rsplit('.', 1)[0] + '.pdf'
+        # md2pdf() writes the PDF to disk and returns None, so its result must not be used
+        # as the PDF path; render first, then hand PdfDocumentManager the path itself.
+        md2pdf(self.pdf_filepath, md_file_path=self.path_to_md_file)
+        self.pdf_file = PdfDocumentManager(self.path_to_md_file, self.pdf_filepath)
+
+    def make_paragraphs(self, html_text):
+        """Split the rendered HTML into lines, store them on self.paragraphs and return them."""
+        html_lines = html_text.split('\n')
+        self.paragraphs = html_lines
+        return html_lines
+
+    def page_counter(self):
+        """Count the pages that precede the first appendix page and cache the result.
+
+        Walks self.pdf_file.text_on_page in order; counting stops at the first page whose
+        beginning matches 'ПРИЛОЖЕНИЕ [А-Я]'. For every counted page the first two lines
+        (stripped, joined by a space, lowercased) are appended to self.first_lines.
+        A non-zero self.count is returned as is, without rescanning.
+        """
+        if self.count:
+            return self.count
+        for page_text in self.pdf_file.text_on_page.values():
+            # Only the beginning of the page is inspected for the appendix heading.
+            page_start = page_text[:20] if len(page_text) > 21 else page_text
+            if re.search('ПРИЛОЖЕНИЕ [А-Я]', page_start.strip()):
+                break
+            self.count += 1
+            leading_rows = [row.strip() for row in page_text.split("\n")[:2]]
+            self.first_lines.append(" ".join(leading_rows).lower())
+        return self.count
+
+    def get_main_headers(self):
+        """Store the text of every <h1> heading of self.html_text in self.headers_main."""
+        # Raw string: '\/' inside a normal literal is an invalid escape sequence.
+        header_main_regex = r"<h1>(.*?)</h1>"
+        self.headers_main = re.findall(header_main_regex, self.html_text)
- def get_headers(self):
- header_regex = "(.*?)<\/h1>"
- self.headers = re.findall(header_regex, self.html_text)
+    def make_headers(self, work_type):
+        """Locate the expected first pages of a report and cache the result in self.headers.
+
+        Only work_type 'VKR' is handled; for any other value self.headers is returned
+        unchanged. Each record gets "marker" set to True and "page" set to the page
+        number on which its lowercase "key" text was found.
+        """
+        if self.headers:
+            return self.headers
+        if work_type == 'VKR':
+            # find first pages
+            expected = [
+                {"name": "Титульный лист", "marker": False, "key": "санкт-петербургский государственный",
+                 "main_character": True, "page": 0},
+                {"name": "Задание на выпускную квалификационную работу", "marker": False, "key": "задание",
+                 "main_character": True, "page": 0},
+                {"name": "Календарный план", "marker": False, "key": "календарный план", "main_character": True,
+                 "page": 0},
+                {"name": "Реферат", "marker": False, "key": "реферат", "main_character": False, "page": 0},
+                {"name": "Abstract", "marker": False, "key": "abstract", "main_character": False, "page": 0},
+                {"name": "Содержание", "marker": False, "key": "содержание", "main_character": False, "page": 0}]
+            # Scan from page 1 up to (not including) min(page count, 2 * number of records).
+            stop_page = min(self.page_counter(), 2 * len(expected))
+            for page in range(1, stop_page):
+                page_text = self.pdf_file.get_text_on_page()[page].lower()
+                # A page is claimed by the first still-unmatched record whose key it contains.
+                for record in expected:
+                    if record["marker"]:
+                        continue
+                    if record["key"] in page_text:
+                        record["marker"] = True
+                        record["page"] = page
+                        break
+            self.headers = expected
+        return self.headers
- def get_chapters(self):
- chapter_regex = "(.*?)<\/h2>"
- self.chapters = re.findall(chapter_regex, self.html_text)
+ def parse_effective_styles(self):
+ for par in self.paragraphs:
+ if len(par.strip()) > 0:
+ paragraph = {"text": par, "runs": []}
+ if '' in paragraph["text"]:
+ paragraph["runs"].append({"text": par, "style": "heading 2"})
+ elif 'Таблица' in paragraph["text"]:
+ if '|' in self.paragraphs[self.paragraphs.index(par)+1]:
+ paragraph['runs'].append({"text": par, "style": "вкр_подпись таблицы"})
+ elif '
= 0:
+ par_num += 1
+ self.chapters[header_ind]["child"].append(
+ {"style": style_name, "text": self.styled_paragraphs[par_ind]["text"],
+ "styled_text": self.styled_paragraphs[par_ind], "number": head_par_ind})
+ return self.chapters
- def get_chapter_with_text(self):
- text = self.html_text
- chapter_name = ''
- for chapter in self.chapters:
- self.split_chapter = text.split("" + chapter + "
")
- self.chapter_with_text.append(chapter_name + self.split_chapter[-2])
- chapter_name = chapter
- text = self.split_chapter[-1]
- self.chapter_with_text.append(chapter_name + text)
+    def find_images(self):
+        """Download the images referenced by 'рисунок' paragraphs and record their sizes.
+
+        For each styled paragraph whose first run has style 'рисунок', the first https
+        link ending in .jpg/.png is fetched and its (width, height), converted from
+        pixels via the image dpi (72 when absent) and the 2.54 factor, is appended to
+        self.inline_shapes. Images that cannot be downloaded or decoded are skipped.
+        Returns self.inline_shapes.
+        """
+        images_regex = r'(https://[\S]+\.(jpg|png))+'
+        for paragraph in self.styled_paragraphs:
+            runs = paragraph['runs']
+            if not runs or runs[0]['style'] != 'рисунок':
+                continue
+            link_match = re.search(images_regex, runs[0]['text'])
+            if link_match is None:
+                continue
+            try:
+                # The link comes from the uploaded document, so never wait on it forever.
+                response = requests.get(link_match.group(1), timeout=10)
+                response.raise_for_status()
+                image = Image.open(BytesIO(response.content))
+            except (requests.RequestException, OSError):
+                # Unreachable or undecodable picture: nothing to measure.
+                continue
+            dpi_image = image.info.get("dpi", (72, 72))
+            width = round((image.width / dpi_image[0]) * 2.54, 3)
+            height = round((image.height / dpi_image[1]) * 2.54, 3)
+            self.inline_shapes.append((width, height))
+        return self.inline_shapes
def get_tables_size(self):
count_table_line = 0
@@ -46,16 +210,28 @@ def get_tables_size(self):
count_table_line +=1
return round(count_table_line/count_paragraph, 4)
+    def find_literature_vkr(self, work_type):
+        """Return the chapter whose text contains 'список использованных источников'.
+
+        The result is cached in self.literature_header; when several chapters match,
+        the last one wins. The initial (empty) value is returned when nothing matches.
+        """
+        if self.literature_header:
+            return self.literature_header
+        for chapter in self.make_chapters(work_type):
+            if 'список использованных источников' in chapter["text"].lower():
+                self.literature_header = chapter
+        return self.literature_header
+
+ def find_header_page(self, work_type):
+ """Return the stored self.headers_page value; work_type is not used here."""
+ return self.headers_page
+
def parse_md_file(self):
- md_text = self.read_md_file()
- self.get_html_from_md(md_text)
- self.get_headers()
- self.get_chapters()
- self.get_chapter_with_text()
+ md_text = self.upload()
+ self.parse(md_text)
+ self.make_headers(work_type="VKR")
self.get_tables_size()
- return f"Заголовки:\n{self.headers}\n\nГлавы:\n{self.chapters}\n\nГлавы с текстом:\n{self.chapter_with_text}\n\nДоля таблиц в тексте:\n{self.get_tables_size()}"
+ self.make_chapters(work_type="VKR")
+ self.find_images()
+ self.find_literature_vkr(work_type="VKR")
+ return f"Заголовки:\n{self.headers_main}\n\nГлавы\n{self.chapters}\n\nИзображения:\n\n{self.inline_shapes}"
+
def main(args):
md_file = MdUpload(args.mdfile)
print(md_file.parse_md_file())
-
diff --git a/app/main/reports/pdf_document/pdf_document_manager.py b/app/main/reports/pdf_document/pdf_document_manager.py
index ddc125e0..1500756f 100644
--- a/app/main/reports/pdf_document/pdf_document_manager.py
+++ b/app/main/reports/pdf_document/pdf_document_manager.py
@@ -4,7 +4,7 @@
class PdfDocumentManager:
- def __init__(self, path_to_file, pdf_filepath=''):
+ def __init__(self, path_to_file, pdf_filepath):
if not pdf_filepath:
self.pdf_file = pdfplumber.open(convert_to(path_to_file, target_format='pdf'))
else:
diff --git a/app/server.py b/app/server.py
index 1bfe9127..edbe6284 100644
--- a/app/server.py
+++ b/app/server.py
@@ -34,8 +34,9 @@
UPLOAD_FOLDER = '/usr/src/project/files'
ALLOWED_EXTENSIONS = {
'pres': {'ppt', 'pptx', 'odp'},
- 'report': {'doc', 'odt', 'docx'}
+ 'report': {'doc', 'odt', 'docx', 'md'}
}
+
DOCUMENT_TYPES = {'Лабораторная работа', 'Курсовая работа', 'ВКР'}
TABLE_COLUMNS = ['Solution', 'User', 'File', 'Criteria', 'Check added', 'LMS date', 'Score']
URL_DOMEN = os.environ.get('URL_DOMEN', f"http://localhost:{os.environ.get('WEB_PORT', 8080)}")
diff --git a/app/utils/check_file.py b/app/utils/check_file.py
index c559ae54..309abfd7 100644
--- a/app/utils/check_file.py
+++ b/app/utils/check_file.py
@@ -3,9 +3,13 @@
def check_file(file, file_extension, allowed_extensions, check_mime=True):
if not file_extension in allowed_extensions:
return "not_allowed_extension"
+
+    if check_mime:
+        if file_extension == 'md':
+            # Markdown is plain text with no signature for `filetype` to detect, so the
+            # MIME type reported for the upload is checked instead. Clients label .md
+            # files as text/markdown or text/x-markdown as well as text/plain.
+            if file.mimetype not in ('text/plain', 'text/markdown', 'text/x-markdown'):
+                return "mime_type_does_not_match_extension"
+        # Other formats: the library maps the detected MIME type to a real file extension.
+        elif file_extension != filetype.guess_extension(file):
+            return "mime_type_does_not_match_extension"
- # Проверяем MIME тип (библиотека автоматически умеет переводить MIME в реальное расширение файла).
- if check_mime and file_extension != filetype.guess_extension(file):
- return "mime_type_does_not_match_extension"
-
- return "ok"
\ No newline at end of file
+ return "ok"
diff --git a/requirements.txt b/requirements.txt
index 0d87cf6c..226de495 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,4 +27,5 @@ pdfplumber==0.6.1
pytest~=7.1.2
filetype==1.2.0
language-tool-python==2.7.1
+md2pdf==1.0.1
markdown==3.4.4