Skip to content

Commit

Permalink
Merge pull request #321 from zeeguu/320-small-error-in-russian-formula
Browse files Browse the repository at this point in the history
FK Updates
  • Loading branch information
mircealungu authored Feb 7, 2025
2 parents 7130df6 + 214603c commit c0fb2a3
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 4 deletions.
130 changes: 130 additions & 0 deletions tools/get_lang_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# coding=utf-8

import zeeguu.core
from zeeguu.core.model import Article, Language
import stanza
from zeeguu.api.app import create_app
import numpy as np
from tqdm import tqdm
import os
from pprint import pprint
import pyphen


# Bootstrap the Flask app and push an application context so the ORM
# (zeeguu.core.model) can be used from this standalone script.
app = create_app()
app.app_context().push()

# Shared SQLAlchemy session for all queries below.
db_session = zeeguu.core.model.db.session
# Max articles sampled per language (NOTE(review): "ARTCILE" is a typo for
# "ARTICLE" — kept as-is because the name is referenced later in the script).
TOTAL_ARTCILE_SAMPLE = 500
# ISO 639-1 codes of the languages to compute readability stats for.
LANGUAGES_TO_CALCULATE_STATS_FOR = [
    "da",
    "nl",
    "en",
    "fr",
    "de",
    "hu",
    "it",
    "no",
    "pl",
    "pt",
    "ru",
    "es",
    "sv",
]
# Accumulator: lang_code -> stats dict, filled by the main loop below and
# read by token_len_sent_len_vec / print_stats_for_lang.
RESULTS = {}


def token_len_sent_len_vec(lang_code, results=None):
    """Return the 3-vector of per-language means used for distance comparison.

    Args:
        lang_code: language code whose statistics to summarize.
        results: optional mapping of lang_code -> stats dict (each with
            "token_length_list", "token_syllables", "sentence_length_list").
            Defaults to the module-level RESULTS accumulator, preserving the
            original call signature.

    Returns:
        np.ndarray of shape (3,): (mean token length in characters,
        mean syllables per token, mean tokens per sentence).
    """
    # Fall back to the module global only when no mapping is supplied,
    # so the function stays usable (and testable) without global state.
    lang_stats = RESULTS[lang_code] if results is None else results[lang_code]
    return np.array(
        (
            np.mean(lang_stats["token_length_list"]),
            np.mean(lang_stats["token_syllables"]),
            np.mean(lang_stats["sentence_length_list"]),
        )
    )


def print_stats_for_lang(lang_code):
    """Print summary statistics for *lang_code* and its distance to other languages.

    Reads the module-level RESULTS accumulator, so it must only be called
    after the main loop has populated RESULTS[lang_code]. For each other
    language in LANGUAGES_TO_CALCULATE_STATS_FOR it prints the Euclidean
    distance between the (token length, syllables, sentence length) mean
    vectors, sorted ascending (most similar language first).
    """
    lang_stats = RESULTS[lang_code]
    np_token_length_list = np.array(lang_stats["token_length_list"])
    np_sentence_length_list = np.array(lang_stats["sentence_length_list"])
    np_token_syl_list = np.array(lang_stats["token_syllables"])
    print("#" * 10 + f" Results for {lang_code} " + "#" * 10)
    print(
        f"Token AVG Length: {np_token_length_list.mean():.2f}, std: {np_token_length_list.std():.2f}"
    )
    print(
        f"Token Syllable AVG Length: {np_token_syl_list.mean():.2f}, std: {np_token_syl_list.std():.2f}"
    )
    print(
        f"Sentence AVG Length: {np_sentence_length_list.mean():.2f}, std: {np_sentence_length_list.std():.2f}"
    )
    # BUG FIX: the original nested double quotes inside a double-quoted
    # f-string, which is a SyntaxError on Python < 3.12 (PEP 701 relaxed
    # this only in 3.12). Single quotes work on every supported version.
    print(
        f"Unique tokens: {lang_stats['unique_vocab']} out of a total of {lang_stats['total_tokens']}"
    )
    dist_to_other_languages = []
    lang_code_vec = token_len_sent_len_vec(lang_code)
    for code in LANGUAGES_TO_CALCULATE_STATS_FOR:
        if code == lang_code:
            continue
        other_lang_vec = token_len_sent_len_vec(code)
        # L2 distance between the two 3-dim mean vectors.
        dist = np.linalg.norm(lang_code_vec - other_lang_vec)
        dist_to_other_languages.append((f"{lang_code}-{code}", dist))
    dist_to_other_languages.sort(key=lambda x: x[1])
    pprint(dist_to_other_languages)


# Main loop: for each language, sample up to TOTAL_ARTCILE_SAMPLE articles,
# tokenize them with stanza, and accumulate token/sentence/syllable stats
# into the module-level RESULTS dict.
for lang_code in LANGUAGES_TO_CALCULATE_STATS_FOR:
    language = Language.find_or_create(lang_code)
    nlp = stanza.Pipeline(lang=lang_code, processors="tokenize")
    # PERF FIX: build the hyphenation dictionary ONCE per language.
    # The original constructed pyphen.Pyphen (and re-derived the code
    # mapping) inside the innermost per-token loop, allocating one
    # dictionary object per token.
    pyphen_lang = lang_code
    if pyphen_lang == "pt":
        # pyphen ships "pt_PT", not a bare "pt" dictionary.
        pyphen_lang = "pt_PT"
    if pyphen_lang == "no":
        # Norwegian Bokmål is "nb" in pyphen.
        pyphen_lang = "nb"
    dic = pyphen.Pyphen(lang=pyphen_lang)
    target_ids = np.array(
        [
            a_id[0]
            for a_id in db_session.query(Article.id)
            .filter(Article.language_id == language.id)
            .all()
        ]
    )
    print("Got articles for language, total: ", len(target_ids))
    # Sample without replacement, capped at the number of available articles.
    sampled_ids = np.random.choice(
        target_ids, min(TOTAL_ARTCILE_SAMPLE, len(target_ids)), replace=False
    )
    print("Starting calculation of stats")
    stats = {
        "token_length_list": [],
        "sentence_length_list": [],
        "token_syllables": [],
        "total_tokens": 0,
    }
    unique_vocab = set()
    for a_id in tqdm(sampled_ids, total=len(sampled_ids)):
        article = Article.find_by_id(a_id)
        doc = nlp(article.content)
        for sent in doc.sentences:
            # Sentence length measured in tokens.
            stats["sentence_length_list"].append(len(sent.tokens))
            for token in sent.tokens:
                text = token.text
                stats["token_length_list"].append(len(text))
                unique_vocab.add(text)
                stats["total_tokens"] += 1
                # Syllable count approximated as hyphenation points + 1.
                syllables = len(dic.positions(text)) + 1
                stats["token_syllables"].append(syllables)

    stats["unique_vocab"] = len(unique_vocab)
    RESULTS[lang_code] = stats

# Clear the terminal ("cls" on Windows, "clear" elsewhere) so the summary
# is not buried under the per-article progress bars, then print the
# per-language report with blank lines between languages.
os.system("cls" if os.name == "nt" else "clear")
for lang_code in LANGUAGES_TO_CALCULATE_STATS_FOR:
    print_stats_for_lang(lang_code)
    print()
    print()
Original file line number Diff line number Diff line change
Expand Up @@ -74,17 +74,17 @@ def get_constants_for_language(cls, language: "language"):
# Constants & references https://github.com/Yoast/YoastSEO.js/issues/267#issue-132433796
# INFO: es/it/nl use ( syllables / 100 words ) instead of ( total syllables / total words )
# Multiplying the constants by 100 should approximate this
if language.code == "es":
if language.code in ["es", "pt"]:
return {"start": 206.84, "sentence": 1.02, "word": 60}
if language.code == "it":
return {"start": 217, "sentence": 1.3, "word": 60}
if language.code == "nl":
return {"start": 206.84, "sentence": 0.93, "word": 77}
if language.code == "fr":
return {"start": 207, "sentence": 1.015, "word": 73.6}
if language.code == "ru":
return {"start": 206.835, "sentence": 1.3, "word": 73.6}
if language.code in ["de", "pl", "da"]:
if language.code in ["ru", "pl"]:
return {"start": 206.835, "sentence": 1.3, "word": 60.1}
if language.code in ["de", "da"]:
return {"start": 180, "sentence": 1, "word": 58.5}
return {"start": 206.835, "sentence": 1.015, "word": 84.6}

Expand Down

0 comments on commit c0fb2a3

Please sign in to comment.