Merge pull request #321 from zeeguu/320-small-error-in-russian-formula
FK Updates
Showing 2 changed files with 134 additions and 4 deletions.
@@ -0,0 +1,130 @@
# coding=utf-8
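# Collects per-language corpus statistics (token length in characters,
# syllables per token, sentence length in tokens) over a sample of Zeeguu
# articles; presumably used to calibrate the per-language Flesch-Kincaid
# ("FK") style readability formulas referenced in the PR title.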

import zeeguu.core
from zeeguu.core.model import Article, Language
import stanza
from zeeguu.api.app import create_app
import numpy as np
from tqdm import tqdm
import os
from pprint import pprint
import pyphen


app = create_app()
app.app_context().push()

db_session = zeeguu.core.model.db.session
TOTAL_ARTICLE_SAMPLE = 500
LANGUAGES_TO_CALCULATE_STATS_FOR = [
    "da",
    "nl",
    "en",
    "fr",
    "de",
    "hu",
    "it",
    "no",
    "pl",
    "pt",
    "ru",
    "es",
    "sv",
]
RESULTS = {}


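# Represent a language as a 3-D feature vector: mean token length in
# characters, mean syllables per token, and mean sentence length in tokens.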
def token_len_sent_len_vec(lang_code):
    lang_stats = RESULTS[lang_code]
    np_token_length_list = np.array(lang_stats["token_length_list"])
    np_token_syl_list = np.array(lang_stats["token_syllables"])
    np_sentence_length_list = np.array(lang_stats["sentence_length_list"])
    return np.array(
        (
            np_token_length_list.mean(),
            np_token_syl_list.mean(),
            np_sentence_length_list.mean(),
        )
    )


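# Print the summary statistics for one language, then rank all other sampled
# languages by Euclidean distance in the feature space above, closest first.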
def print_stats_for_lang(lang_code):
    lang_stats = RESULTS[lang_code]
    np_token_length_list = np.array(lang_stats["token_length_list"])
    np_sentence_length_list = np.array(lang_stats["sentence_length_list"])
    np_token_syl_list = np.array(lang_stats["token_syllables"])
    print("#" * 10 + f" Results for {lang_code} " + "#" * 10)
    print(
        f"Token AVG Length: {np_token_length_list.mean():.2f}, std: {np_token_length_list.std():.2f}"
    )
    print(
        f"Token Syllable AVG Length: {np_token_syl_list.mean():.2f}, std: {np_token_syl_list.std():.2f}"
    )
    print(
        f"Sentence AVG Length: {np_sentence_length_list.mean():.2f}, std: {np_sentence_length_list.std():.2f}"
    )
    print(
        f"Unique tokens: {lang_stats['unique_vocab']} out of a total of {lang_stats['total_tokens']}"
    )
    dist_to_other_languages = []
    lang_code_vec = token_len_sent_len_vec(lang_code)
    for code in LANGUAGES_TO_CALCULATE_STATS_FOR:
        if code == lang_code:
            continue
        other_lang_vec = token_len_sent_len_vec(code)
        dist = np.linalg.norm(lang_code_vec - other_lang_vec)
        dist_to_other_languages.append((f"{lang_code}-{code}", dist))
    dist_to_other_languages.sort(key=lambda x: x[1])
    pprint(dist_to_other_languages)


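# Main loop: for each language, sample up to TOTAL_ARTICLE_SAMPLE of its
# articles, tokenize them with stanza, and collect token/sentence statistics.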
for lang_code in LANGUAGES_TO_CALCULATE_STATS_FOR:
    language = Language.find_or_create(lang_code)
    nlp = stanza.Pipeline(lang=lang_code, processors="tokenize")
    # pyphen publishes Portuguese patterns under the regional code "pt_PT"
    # and Norwegian (Bokmål) patterns under "nb", so remap those codes; one
    # hyphenation dictionary per language is enough.
    pyphen_lang = lang_code
    if pyphen_lang == "pt":
        pyphen_lang = "pt_PT"
    if pyphen_lang == "no":
        pyphen_lang = "nb"
    dic = pyphen.Pyphen(lang=pyphen_lang)
    target_ids = np.array(
        [
            a_id[0]
            for a_id in db_session.query(Article.id)
            .filter(Article.language_id == language.id)
            .all()
        ]
    )
    print("Got articles for language, total: ", len(target_ids))
    sampled_ids = np.random.choice(
        target_ids, min(TOTAL_ARTICLE_SAMPLE, len(target_ids)), replace=False
    )
    print("Starting calculation of stats")
    stats = {
        "token_length_list": [],
        "sentence_length_list": [],
        "token_syllables": [],
        "total_tokens": 0,
    }
    unique_vocab = set()
    for a_id in tqdm(sampled_ids, total=len(sampled_ids)):
        article = Article.find_by_id(a_id)
        doc = nlp(article.content)
        for sent in doc.sentences:
            stats["sentence_length_list"].append(len(sent.tokens))
            for token in sent.tokens:
                text = token.text
                stats["token_length_list"].append(len(text))
                unique_vocab.add(text)
                stats["total_tokens"] += 1
                # dic.positions() returns the valid hyphenation points, so
                # the syllable count is one more than the number of breaks.
                syllables = len(dic.positions(text)) + 1
                stats["token_syllables"].append(syllables)

    stats["unique_vocab"] = len(unique_vocab)
    RESULTS[lang_code] = stats


os.system("cls" if os.name == "nt" else "clear")
for lang_code in LANGUAGES_TO_CALCULATE_STATS_FOR:
    print_stats_for_lang(lang_code)
    print()
    print()
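For context, the three per-language means this script reports are exactly the ingredients of Flesch-Kincaid-style formulas. A minimal sketch using the standard English FKGL coefficients (the per-language constants that PR #321 actually adjusts presumably live in the second changed file, which is not shown here):

def flesch_kincaid_grade(words_per_sentence, syllables_per_word):
    # Standard English Flesch-Kincaid grade-level coefficients; per-language
    # variants (such as the Russian formula referenced in the PR title) swap
    # in different constants for the same two ratios.
    return 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59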