hangeulbada · ssoxong · Nov 27, 2024 · Nov 27, 2024
diff --git a/crud/.ipynb_checkpoints/levenshtein-checkpoint.py b/crud/.ipynb_checkpoints/levenshtein-checkpoint.py
@@ -0,0 +1,40 @@
+# Import
+import re
+import itertools
+
+from .difficulty import decomposition
+
+
+def levenshtein_distance(str1, str2):
+    """Return the Levenshtein (edit) distance between two sequences.
+
+    Counts the fewest single-item insertions, deletions and substitutions
+    that turn ``str1`` into ``str2``. Items are compared with ``==``.
+    """
+    # Only the previous DP row is kept; the initial row is the cost of
+    # building each prefix of str2 from an empty prefix of str1.
+    prev = list(range(len(str2) + 1))
+    for row, left in enumerate(str1, start=1):
+        cur = [row]  # column 0: delete the first `row` items of str1
+        for col, right in enumerate(str2, start=1):
+            if left == right:
+                cur.append(prev[col - 1])
+            else:
+                cur.append(1 + min(prev[col], cur[col - 1], prev[col - 1]))
+        prev = cur
+    return prev[-1]
+
+
+def jamo_similarity(word1, word2):
+    """Return the jamo-level similarity of two Korean words, in [0.0, 1.0].
+
+    Each word is passed to ``decomposition`` and the result is flattened
+    into one jamo sequence. The score is ``1 - distance / longer_length``
+    (Levenshtein distance); two empty sequences are identical, so 1.0.
+    """
+    jamo1 = list(itertools.chain.from_iterable(decomposition(word1)))
+    jamo2 = list(itertools.chain.from_iterable(decomposition(word2)))
+    longest = max(len(jamo1), len(jamo2))
+    if longest == 0:  # nothing to compare; avoids ZeroDivisionError
+        return 1.0
+    return 1 - levenshtein_distance(jamo1, jamo2) / longest
diff --git a/crud/.ipynb_checkpoints/ocr-checkpoint.py b/crud/.ipynb_checkpoints/ocr-checkpoint.py
@@ -51,6 +51,7 @@ def group_text_by_coord(texts, coordinates, y_threshold=40):
 
 
 def text_preprocess(infer_text, first_coord, coord, y_thres):
+    """ Text Preprocessing """
     number_count = 0
     number_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20']
     output_text = []
@@ -96,6 +97,7 @@ def text_preprocess(infer_text, first_coord, coord, y_thres):
 
 
 def infer_ocr(filepath): # `filepath` is S3 Path
+    """ Inference OCR Using Image File Path """
     # Initialize EasyOCR Reader
     reader = easyocr.Reader(
         ['ko'], 
@@ -105,10 +107,10 @@ def infer_ocr(filepath): # `filepath` is S3 Path
         download_enabled=False,
     )
 
-    # OCR 수행
+    # Inference OCR
     result = reader.readtext(filepath, width_ths=0.2)
 
-    # Confidence Threshold 값 정의
+    # Define Confidence Threshold
     conf_thres = 0.1
 
     coord = []

diff --git a/crud/levenshtein.py b/crud/levenshtein.py
@@ -0,0 +1,40 @@
+# Import
+import re
+import itertools
+
+from .difficulty import decomposition
+
+
+def levenshtein_distance(str1, str2):
+    """Return the Levenshtein (edit) distance between two sequences.
+
+    Counts the fewest single-item insertions, deletions and substitutions
+    that turn ``str1`` into ``str2``. Items are compared with ``==``.
+    """
+    # Only the previous DP row is kept; the initial row is the cost of
+    # building each prefix of str2 from an empty prefix of str1.
+    prev = list(range(len(str2) + 1))
+    for row, left in enumerate(str1, start=1):
+        cur = [row]  # column 0: delete the first `row` items of str1
+        for col, right in enumerate(str2, start=1):
+            if left == right:
+                cur.append(prev[col - 1])
+            else:
+                cur.append(1 + min(prev[col], cur[col - 1], prev[col - 1]))
+        prev = cur
+    return prev[-1]
+
+
+def jamo_similarity(word1, word2):
+    """Return the jamo-level similarity of two Korean words, in [0.0, 1.0].
+
+    Each word is passed to ``decomposition`` and the result is flattened
+    into one jamo sequence. The score is ``1 - distance / longer_length``
+    (Levenshtein distance); two empty sequences are identical, so 1.0.
+    """
+    jamo1 = list(itertools.chain.from_iterable(decomposition(word1)))
+    jamo2 = list(itertools.chain.from_iterable(decomposition(word2)))
+    longest = max(len(jamo1), len(jamo2))
+    if longest == 0:  # nothing to compare; avoids ZeroDivisionError
+        return 1.0
+    return 1 - levenshtein_distance(jamo1, jamo2) / longest
diff --git a/crud/ocr.py b/crud/ocr.py
@@ -51,6 +51,7 @@ def group_text_by_coord(texts, coordinates, y_threshold=40):
 
 
 def text_preprocess(infer_text, first_coord, coord, y_thres):
+    """ Text Preprocessing """
     number_count = 0
     number_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20']
     output_text = []
@@ -96,6 +97,7 @@ def text_preprocess(infer_text, first_coord, coord, y_thres):
 
 
 def infer_ocr(filepath): # `filepath` is S3 Path
+    """ Inference OCR Using Image File Path """
     # Initialize EasyOCR Reader
     reader = easyocr.Reader(
         ['ko'], 
@@ -105,10 +107,10 @@ def infer_ocr(filepath): # `filepath` is S3 Path
         download_enabled=False,
     )
 
-    # OCR 수행
+    # Inference OCR
     result = reader.readtext(filepath, width_ths=0.2)
 
-    # Confidence Threshold 값 정의
+    # Define Confidence Threshold
     conf_thres = 0.1
 
     coord = []