# Import
import re
import itertools


def levenshtein_distance(str1, str2):
    """Compute the Levenshtein (edit) distance between two sequences.

    Args:
        str1: First sequence. Any sized, iterable sequence of items that
            support ``==`` works (a ``str``, or a list of jamo).
        str2: Second sequence, same requirements as ``str1``.

    Returns:
        int: The minimum number of single-item insertions, deletions and
        substitutions needed to turn ``str1`` into ``str2``.
    """
    # The distance is symmetric, so keep the shorter sequence on the column
    # axis; the rolling rows below are then as small as possible.
    if len(str1) < len(str2):
        str1, str2 = str2, str1

    # previous_row[j] holds the distance between the already-processed prefix
    # of str1 and str2[:j]. Only the previous row is needed to build the next
    # one, so the full (m + 1) x (n + 1) table is never materialized.
    previous_row = list(range(len(str2) + 1))
    for i, item1 in enumerate(str1, start=1):
        current_row = [i]  # distance from str1[:i] to the empty prefix
        for j, item2 in enumerate(str2, start=1):
            if item1 == item2:
                # Matching items cost nothing: carry the diagonal value.
                current_row.append(previous_row[j - 1])
            else:
                current_row.append(
                    min(
                        previous_row[j],      # deletion
                        current_row[j - 1],   # insertion
                        previous_row[j - 1],  # substitution
                    ) + 1
                )
        previous_row = current_row

    return previous_row[-1]


def jamo_similarity(word1, word2):
    """Compute a jamo-level similarity score between two Korean words.

    Each word is passed through ``decomposition`` (from ``.difficulty``), the
    per-element results are flattened into one flat list per word, and the
    score is ``1 - edit_distance / length_of_longer_list``.

    Args:
        word1: First word to compare.
        word2: Second word to compare.

    Returns:
        float: A value in ``[0.0, 1.0]``; ``1.0`` means the flattened
        sequences are identical. Two empty sequences are treated as
        identical and also yield ``1.0``.
    """
    # Imported here so that levenshtein_distance stays usable on its own.
    # NOTE(review): decomposition presumably yields one group of jamo per
    # syllable -- confirm against crud/difficulty.py.
    from .difficulty import decomposition

    # Split into jamo and flatten the per-element groups.
    jamo1 = list(itertools.chain.from_iterable(decomposition(word1)))
    jamo2 = list(itertools.chain.from_iterable(decomposition(word2)))

    longest = max(len(jamo1), len(jamo2))
    if longest == 0:
        # Nothing to compare on either side; avoid dividing by zero.
        return 1.0

    return 1 - (levenshtein_distance(jamo1, jamo2) / longest)
# Import
import re
import itertools


def levenshtein_distance(str1, str2):
    """Compute the Levenshtein (edit) distance between two sequences.

    Args:
        str1: First sequence. Any sized, iterable sequence of items that
            support ``==`` works (a ``str``, or a list of jamo).
        str2: Second sequence, same requirements as ``str1``.

    Returns:
        int: The minimum number of single-item insertions, deletions and
        substitutions needed to turn ``str1`` into ``str2``.
    """
    # The distance is symmetric, so keep the shorter sequence on the column
    # axis; the rolling rows below are then as small as possible.
    if len(str1) < len(str2):
        str1, str2 = str2, str1

    # previous_row[j] holds the distance between the already-processed prefix
    # of str1 and str2[:j]. Only the previous row is needed to build the next
    # one, so the full (m + 1) x (n + 1) table is never materialized.
    previous_row = list(range(len(str2) + 1))
    for i, item1 in enumerate(str1, start=1):
        current_row = [i]  # distance from str1[:i] to the empty prefix
        for j, item2 in enumerate(str2, start=1):
            if item1 == item2:
                # Matching items cost nothing: carry the diagonal value.
                current_row.append(previous_row[j - 1])
            else:
                current_row.append(
                    min(
                        previous_row[j],      # deletion
                        current_row[j - 1],   # insertion
                        previous_row[j - 1],  # substitution
                    ) + 1
                )
        previous_row = current_row

    return previous_row[-1]


def jamo_similarity(word1, word2):
    """Compute a jamo-level similarity score between two Korean words.

    Each word is passed through ``decomposition`` (from ``.difficulty``), the
    per-element results are flattened into one flat list per word, and the
    score is ``1 - edit_distance / length_of_longer_list``.

    Args:
        word1: First word to compare.
        word2: Second word to compare.

    Returns:
        float: A value in ``[0.0, 1.0]``; ``1.0`` means the flattened
        sequences are identical. Two empty sequences are treated as
        identical and also yield ``1.0``.
    """
    # Imported here so that levenshtein_distance stays usable on its own.
    # NOTE(review): decomposition presumably yields one group of jamo per
    # syllable -- confirm against crud/difficulty.py.
    from .difficulty import decomposition

    # Split into jamo and flatten the per-element groups.
    jamo1 = list(itertools.chain.from_iterable(decomposition(word1)))
    jamo2 = list(itertools.chain.from_iterable(decomposition(word2)))

    longest = max(len(jamo1), len(jamo2))
    if longest == 0:
        # Nothing to compare on either side; avoid dividing by zero.
        return 1.0

    return 1 - (levenshtein_distance(jamo1, jamo2) / longest)
+107,10 @@ def infer_ocr(filepath): # `filepath` is S3 Path download_enabled=False, ) - # OCR 수행 + # Inference OCR result = reader.readtext(filepath, width_ths=0.2) - # Confidence Threshold 값 정의 + # Define Confidence Threshold conf_thres = 0.1 coord = []