Add lexicon tools for use with XRI datasets #621
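Adds two scripts under silnlp/common for analyzing the word lists produced by alignment experiments: compare_lex.py compares the unique words of two experiments' corpora and reports lexical coverage, and count_words.py counts lexicon entries and reports how many target renderings the most common and most diverse source words have.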

Open · wants to merge 4 commits into master · showing changes from 2 commits
153 changes: 153 additions & 0 deletions silnlp/common/compare_lex.py
@@ -0,0 +1,153 @@
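"""Compare the unique words in the corpora of two alignment experiments.

For each experiment folder the script collects the unique words in src.txt and/or
trg.txt, writes them to src_words.txt / trg_words.txt, writes the words missing from
the other experiment to unmatched_src_words.txt / unmatched_trg_words.txt, and appends
coverage counts to lex_coverage.txt. Run it as a module from the repository root,
e.g. (with placeholder experiment folder names):

    python -m silnlp.common.compare_lex exp1 exp2 --stats
"""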
import argparse
import re
from pathlib import Path

import numpy

from ..common.environment import SIL_NLP_ENV


def get_all_words(src_file: Path) -> list:
    """Return all normalized words found in the given corpus file."""
    words = []
    pattern = re.compile(r",(?=\S)")  # Look for commas with no following space
    with open(src_file, "r", encoding="utf8") as src_data_file:
        for line in src_data_file:
            for word in line.split(" "):
                word = word.strip().strip("'\"\\;,:.!?()-[]0123456789").lower()
                finder = pattern.search(word)
                if finder:  # Add a space after the comma as needed
                    word = word[:finder.span()[1]] + " " + word[finder.span()[1]:]
                if word != "":
                    words.append(word)
    return words

def find_unique(words1: list, words2: list) -> list:
    """Return the words from words1 that do not appear in words2."""
    unique_words = []
    for word in words1:
        if word not in words2:
            unique_words.append(word)
    return unique_words


def main() -> None:
    parser = argparse.ArgumentParser(description="Compares unique words in two corpora")
    parser.add_argument("exp1", help="First experiment folder from path S:\\Alignment\\experiments\\")
    parser.add_argument("exp2", help="Second experiment folder from path S:\\Alignment\\experiments\\")
    parser.add_argument("--stats", help="If set, print the unique word counts and lexical coverage to the console",
                        action="store_true")
    parser.add_argument("--src", help="If set, only the source side of the two experiment lexicons is compared",
                        action="store_true")
    parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared",
                        action="store_true")
    args = parser.parse_args()

    # If not explicitly limited, compare both source and target lexicons
    if not args.src and not args.trg:
        args.src = True
        args.trg = True

    lex_path1 = SIL_NLP_ENV.align_experiments_dir / args.exp1
    lex_path2 = SIL_NLP_ENV.align_experiments_dir / args.exp2

    # Compare source words and write results to files
    if args.src:
        src_file1 = lex_path1 / "src.txt"
        src_file2 = lex_path2 / "src.txt"

        # Find all words and unique words on the source side
        src_words1 = get_all_words(src_file1)
        unique_src_words1 = numpy.unique(numpy.array(src_words1))
        src_words2 = get_all_words(src_file2)
        unique_src_words2 = numpy.unique(numpy.array(src_words2))
        src1_only_words = find_unique(unique_src_words1, unique_src_words2)
        src2_only_words = find_unique(unique_src_words2, unique_src_words1)

        # Write unique source words to files
        src_words_file1 = lex_path1 / "src_words.txt"
        src_words_file2 = lex_path2 / "src_words.txt"
        with open(src_words_file1, "w", encoding="utf8") as output_file:
            for word in unique_src_words1:
                output_file.write(word + "\n")
        with open(src_words_file2, "w", encoding="utf8") as output_file:
            for word in unique_src_words2:
                output_file.write(word + "\n")

        # Write source words missing from the alternate source file
        with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
            output_file.write(f"src.txt words not found in {src_file2}\n")
            for word in src1_only_words:
                output_file.write(word + "\n")
        with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
            output_file.write(f"src.txt words not found in {src_file1}\n")
            for word in src2_only_words:
                output_file.write(word + "\n")

    # Compare target words and write results to files
    if args.trg:
        trg_file1 = lex_path1 / "trg.txt"
        trg_file2 = lex_path2 / "trg.txt"

        # Find all words and unique words on the target side
        trg_words1 = get_all_words(trg_file1)
        unique_trg_words1 = numpy.unique(numpy.array(trg_words1))
        trg_words2 = get_all_words(trg_file2)
        unique_trg_words2 = numpy.unique(numpy.array(trg_words2))
        trg1_only_words = find_unique(unique_trg_words1, unique_trg_words2)
        trg2_only_words = find_unique(unique_trg_words2, unique_trg_words1)

        # Write unique target words to files
        trg_words_file1 = lex_path1 / "trg_words.txt"
        trg_words_file2 = lex_path2 / "trg_words.txt"
        with open(trg_words_file1, "w", encoding="utf8") as output_file:
            for word in unique_trg_words1:
                output_file.write(word + "\n")
        with open(trg_words_file2, "w", encoding="utf8") as output_file:
            for word in unique_trg_words2:
                output_file.write(word + "\n")

        # Write target words missing from the alternate target file
        with (lex_path1 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file:
            output_file.write(f"trg.txt words not found in {trg_file2}\n")
            for word in trg1_only_words:
                output_file.write(word + "\n")
        with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file:
            output_file.write(f"trg.txt words not found in {trg_file1}\n")
            for word in trg2_only_words:
                output_file.write(word + "\n")

    # Write the lex coverage stats
    with (lex_path1 / "lex_coverage.txt").open("a", encoding="utf8") as output_file:
        if args.src:
            output_file.write(f"Unique words in src.txt: {len(unique_src_words1)}\n")
            output_file.write(
                f"Words also found in {src_words_file2}: {len(unique_src_words1) - len(src1_only_words)}\n")
            output_file.write(f"Words missing from {src_words_file2}: {len(src1_only_words)}\n")
        if args.trg:
            output_file.write(f"Unique words in trg.txt: {len(unique_trg_words1)}\n")
            output_file.write(
                f"Words also found in {trg_words_file2}: {len(unique_trg_words1) - len(trg1_only_words)}\n")
            output_file.write(f"Words missing from {trg_words_file2}: {len(trg1_only_words)}\n")

    with (lex_path2 / "lex_coverage.txt").open("a", encoding="utf8") as output_file:
        if args.src:
            output_file.write(f"Unique words in src.txt: {len(unique_src_words2)}\n")
            output_file.write(
                f"Words also found in {src_words_file1}: {len(unique_src_words2) - len(src2_only_words)}\n")
            output_file.write(f"Words missing from {src_words_file1}: {len(src2_only_words)}\n")
        if args.trg:
            output_file.write(f"Unique words in trg.txt: {len(unique_trg_words2)}\n")
            output_file.write(
                f"Words also found in {trg_words_file1}: {len(unique_trg_words2) - len(trg2_only_words)}\n")
            output_file.write(f"Words missing from {trg_words_file1}: {len(trg2_only_words)}\n")

    # Output stats if requested
    if args.stats:
        if args.src:
            print(f"Unique words in src.txt: {len(unique_src_words1)}")
            print(f"Words also found in {src_words_file2}: {len(unique_src_words1) - len(src1_only_words)}")
            print(f"Words missing from {src_words_file2}: {len(src1_only_words)}")
        if args.trg:
            print(f"Unique words in trg.txt: {len(unique_trg_words1)}")
            print(f"Words also found in {trg_words_file2}: {len(unique_trg_words1) - len(trg1_only_words)}")
            print(f"Words missing from {trg_words_file2}: {len(trg1_only_words)}")


if __name__ == "__main__":
    main()
150 changes: 150 additions & 0 deletions silnlp/common/count_words.py
@@ -0,0 +1,150 @@
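"""Count lexicon entries for an alignment experiment.

The script tallies the words in the experiment's src.txt and trg.txt files, reads the
aligner's lexicon, collects the target renderings of the most common and most diverse
source words, and writes trg_renderings.csv, src_words.txt, and lex_stats.csv to the
experiment folder. Run it as a module from the repository root, e.g. (with a placeholder
experiment folder name):

    python -m silnlp.common.count_words my_experiment --aligner eflomal --stats
"""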
import argparse
import csv
import re
from collections import Counter

import numpy
import pandas as pd

from ..common.environment import SIL_NLP_ENV


def is_word(word: str) -> bool:
    """Return True if the string contains at least one alphabetic character."""
    return any(char.isalpha() for char in word)

def main() -> None:
    parser = argparse.ArgumentParser(description="Counts lexicon entries")
    parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\")
    parser.add_argument("--aligner", help="Aligner: eflomal, fast-align, hmm", default="eflomal")
    parser.add_argument("--num", help="Number of most common words to include", type=int, default=100)
    parser.add_argument("--stats", help="If set, print the word counts and number of renderings for common words",
                        action="store_true")
    args = parser.parse_args()

    # Set up the path and lexicon file names
    lex_path = SIL_NLP_ENV.align_experiments_dir / args.experiment
    lex_txt_file = "lexicon." + args.aligner + ".txt"
    new_lex_txt_file = "lexicon." + args.aligner + "_clean.txt"

    # Get the source and target iso codes from the experiment config
    with (lex_path / "config.yml").open("r", encoding="utf8") as conf:
        for line in conf:
            if "src" in line.split(" ")[0]:
                src_iso = line.split(" ")[1].split("-")[0]
            elif "trg" in line.split(" ")[0]:
                trg_iso = line.split(" ")[1].split("-")[0]
    # TODO: error or use a default if it fails to get both iso codes

    # Look for commas with no following whitespace
    pattern = re.compile(r",(?=\S)")

    # Pull all the separate words from the source data. Take the most common and all unique words.
    src_words = []
    with (lex_path / "src.txt").open("r", encoding="utf8") as src_data_file:
        for line in src_data_file:
            for word in line.split(" "):
                word = word.strip().strip("'\"\\;,:.!?()-[]").lower()
                # Add a space after the comma as needed
                finder = pattern.search(word)
                if finder:
                    word = word[:finder.span()[1]] + " " + word[finder.span()[1]:]
                if word != "" and not word.isnumeric():
                    src_words.append(word)
    src_data_word_counter = Counter(src_words).most_common(args.num)
    unique_src_words = numpy.unique(numpy.array(src_words))

    # Pull all the separate words from the target data. Take all unique words.
    trg_words = []
    with (lex_path / "trg.txt").open("r", encoding="utf8") as trg_data_file:
        for line in trg_data_file:
            for word in line.split(" "):
                word = word.strip().strip("'\"\\;,:.!?()-[]").lower()
                # Add a space after the comma as needed
                finder = pattern.search(word)
                if finder:
                    word = word[:finder.span()[1]] + " " + word[finder.span()[1]:]
                if word != "" and not word.isnumeric():
                    trg_words.append(word)
    unique_trg_words = numpy.unique(numpy.array(trg_words))

    # Clean the lexicon file and prep it for the pandas csv reader
    with (lex_path / lex_txt_file).open("r", encoding="utf8") as lexicon:
        with (lex_path / new_lex_txt_file).open("w", encoding="utf8") as new_lex:
            for line in lexicon.readlines():
                line = line.replace("'", "\\'").replace("\"", "\\\"")
                if is_word(line.split("\t")[0]):
                    new_lex.write(line)

    # Read the lexicon into a dataframe after escaping out quotes.
    # Find the most diverse src words (most lexicon entries).
    lex_df = pd.read_csv(lex_path / new_lex_txt_file, sep="\t")
    lex_df.columns = [src_iso, trg_iso, "percent"]
    lex_word_counter = Counter(lex_df[src_iso]).most_common(args.num)

    # Find all the renderings for the most diverse words.
    diverse_wd = {}  # Dictionary of most diverse src words and their trg renderings
    diverse_wd_renderings = 0  # Cumulative trg renderings for the most diverse src words
    for word, count in lex_word_counter:
        diverse_wd_renderings += count
        diverse_wd[word] = []
        for index, trg_word in enumerate(lex_df[trg_iso]):
            if word == lex_df[src_iso][index]:
                diverse_wd[word].append(trg_word)

    # Find all the renderings for the most common words.
    common_wd = {}  # Dictionary of most common src words and their trg renderings
    common_wd_instances = 0  # Instances of the most common src words
    common_wd_renderings = 0  # Cumulative trg renderings for the most common src words
    for word, count in src_data_word_counter:
        common_wd_instances += count
        common_wd[word] = []
        for index, trg_word in enumerate(lex_df[trg_iso]):
            if word == lex_df[src_iso][index]:
                common_wd[word].append(trg_word)
    for renderings in common_wd.values():
        common_wd_renderings += len(renderings)

    # Write the dictionary of renderings for the most common words to a .csv file in the experiment directory.
    with (lex_path / "trg_renderings.csv").open("w", encoding="utf8", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([f"{src_iso} Word", f"{trg_iso} Words"])
        for src_wd in common_wd:
            writer.writerow([src_wd, *common_wd[src_wd]])

    with (lex_path / "src_words.txt").open("w", encoding="utf8") as output_file:
        for word in unique_src_words:
            output_file.write(word + "\n")

    # Optionally, output a few stats
    if args.stats:
        print(f"\nSource Data: {len(src_words)} words and {len(unique_src_words)} unique words.")
        print(f"Target Data: {len(trg_words)} words and {len(unique_trg_words)} unique words.")
        print(f"\nThe {args.num} most common source words in the dataset appear an average of "
              f"{round(common_wd_instances / args.num)} times per word.")
        print(f"The {args.num} most common source words in the dataset have an average of "
              f"{round(common_wd_renderings / args.num)} target renderings per word.")
        print(f"The {args.num} most diverse source words in the dataset have an average of "
              f"{round(diverse_wd_renderings / args.num)} renderings per word.\n")

    # Print the "score" and write it to file
    score = round(100 * (1 / (common_wd_renderings / args.num)))
    print(f"Internal Consistency score is {score}.\n A score of 100 indicates that the top source words "
          f"each average 1 target rendering.\n Score = 100 * 1 / (average trg renderings per most common src word)")

with (lex_path / "lex_stats.csv").open("w", encoding="utf8") as stats_file:
writer = csv.writer(stats_file)
writer.writerow(['#_Src_wds','#_trg_wds','Unique_src','Unique_trg','Num_top_wds','Avg_inst',
'avg_trg_renderings','avg_diverse_renderings'])
writer.writerow([len(src_words),len(trg_words),len(unique_src_words),len(unique_trg_words),args.num,
round((common_wd_instances)/args.num),round((common_wd_renderings)/args.num),
round((diverse_wd_renderings)/args.num)])


if __name__ == "__main__":
    main()