Add lexicon tools for use with XRI datasets #621
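Adds two scripts under silnlp/common for analyzing the word lists produced by alignment experiments: compare_lex.py compares the unique words of two experiments' corpora and reports lexical coverage, and count_words.py counts lexicon entries and reports how many target renderings the most common and most diverse source words have.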

Open · wants to merge 4 commits into master · showing changes from 2 commits
153 changes: 153 additions & 0 deletions silnlp/common/compare_lex.py
@@ -0,0 +1,153 @@
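"""Compare the unique words in the corpora of two alignment experiments.

For each experiment folder the script collects the unique words in src.txt and/or
trg.txt, writes them to src_words.txt / trg_words.txt, writes the words missing from
the other experiment to unmatched_src_words.txt / unmatched_trg_words.txt, and appends
coverage counts to lex_coverage.txt. Run it as a module from the repository root,
e.g. (with placeholder experiment folder names):

    python -m silnlp.common.compare_lex exp1 exp2 --stats
"""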
import argparse
import re
from pathlib import Path

import numpy

from ..common.environment import SIL_NLP_ENV


def get_all_words(src_file: Path) -> list:
    """Return all normalized words found in the given corpus file."""
    words = []
    pattern = re.compile(r",(?=\S)")  # Look for commas with no following space
    with open(src_file, "r", encoding="utf8") as src_data_file:
        for line in src_data_file:
            for word in line.split(" "):
                word = word.strip().strip("'\"\\;,:.!?()-[]0123456789").lower()
                finder = pattern.search(word)
                if finder:  # Add a space after the comma as needed
                    word = word[:finder.span()[1]] + " " + word[finder.span()[1]:]
                if word != "":
                    words.append(word)
    return words

def find_unique(words1: list, words2: list) -> list:
    """Return the words from words1 that do not appear in words2."""
    unique_words = []
    for word in words1:
        if word not in words2:
            unique_words.append(word)
    return unique_words


def main() -> None:
    parser = argparse.ArgumentParser(description="Compares unique words in two corpora")
    parser.add_argument("exp1", help="First experiment folder from path S:\\Alignment\\experiments\\")
    parser.add_argument("exp2", help="Second experiment folder from path S:\\Alignment\\experiments\\")
    parser.add_argument("--stats", help="If set, print the unique word counts and lexical coverage to the console",
                        action="store_true")
    parser.add_argument("--src", help="If set, only the source side of the two experiment lexicons is compared",
                        action="store_true")
    parser.add_argument("--trg", help="If set, only the target side of the two experiment lexicons is compared",
                        action="store_true")
    args = parser.parse_args()

    # If not explicitly limited, compare both source and target lexicons
    if not args.src and not args.trg:
        args.src = True
        args.trg = True

    lex_path1 = SIL_NLP_ENV.align_experiments_dir / args.exp1
    lex_path2 = SIL_NLP_ENV.align_experiments_dir / args.exp2

    # Compare source words and write results to files
    if args.src:
        src_file1 = lex_path1 / "src.txt"
        src_file2 = lex_path2 / "src.txt"

        # Find all words and unique words on the source side
        src_words1 = get_all_words(src_file1)
        unique_src_words1 = numpy.unique(numpy.array(src_words1))
        src_words2 = get_all_words(src_file2)
        unique_src_words2 = numpy.unique(numpy.array(src_words2))
        src1_only_words = find_unique(unique_src_words1, unique_src_words2)
        src2_only_words = find_unique(unique_src_words2, unique_src_words1)

        # Write unique source words to files
        src_words_file1 = lex_path1 / "src_words.txt"
        src_words_file2 = lex_path2 / "src_words.txt"
        with open(src_words_file1, "w", encoding="utf8") as output_file:
            for word in unique_src_words1:
                output_file.write(word + "\n")
        with open(src_words_file2, "w", encoding="utf8") as output_file:
            for word in unique_src_words2:
                output_file.write(word + "\n")

        # Write source words missing from the alternate source file
        with (lex_path1 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
            output_file.write(f"src.txt words not found in {src_file2}\n")
            for word in src1_only_words:
                output_file.write(word + "\n")
        with (lex_path2 / "unmatched_src_words.txt").open("w", encoding="utf8") as output_file:
            output_file.write(f"src.txt words not found in {src_file1}\n")
            for word in src2_only_words:
                output_file.write(word + "\n")

    # Compare target words and write results to files
    if args.trg:
        trg_file1 = lex_path1 / "trg.txt"
        trg_file2 = lex_path2 / "trg.txt"

        # Find all words and unique words on the target side
        trg_words1 = get_all_words(trg_file1)
        unique_trg_words1 = numpy.unique(numpy.array(trg_words1))
        trg_words2 = get_all_words(trg_file2)
        unique_trg_words2 = numpy.unique(numpy.array(trg_words2))
        trg1_only_words = find_unique(unique_trg_words1, unique_trg_words2)
        trg2_only_words = find_unique(unique_trg_words2, unique_trg_words1)

        # Write unique target words to files
        trg_words_file1 = lex_path1 / "trg_words.txt"
        trg_words_file2 = lex_path2 / "trg_words.txt"
        with open(trg_words_file1, "w", encoding="utf8") as output_file:
            for word in unique_trg_words1:
                output_file.write(word + "\n")
        with open(trg_words_file2, "w", encoding="utf8") as output_file:
            for word in unique_trg_words2:
                output_file.write(word + "\n")

        # Write target words missing from the alternate target file
        with (lex_path1 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file:
            output_file.write(f"trg.txt words not found in {trg_file2}\n")
            for word in trg1_only_words:
                output_file.write(word + "\n")
        with (lex_path2 / "unmatched_trg_words.txt").open("w", encoding="utf8") as output_file:
            output_file.write(f"trg.txt words not found in {trg_file1}\n")
            for word in trg2_only_words:
                output_file.write(word + "\n")

    # Write the lex coverage stats
    with (lex_path1 / "lex_coverage.txt").open("a", encoding="utf8") as output_file:
        if args.src:
            output_file.write(f"Unique words in src.txt: {len(unique_src_words1)}\n")
            output_file.write(
                f"Words also found in {src_words_file2}: {len(unique_src_words1) - len(src1_only_words)}\n")
            output_file.write(f"Words missing from {src_words_file2}: {len(src1_only_words)}\n")
        if args.trg:
            output_file.write(f"Unique words in trg.txt: {len(unique_trg_words1)}\n")
            output_file.write(
                f"Words also found in {trg_words_file2}: {len(unique_trg_words1) - len(trg1_only_words)}\n")
            output_file.write(f"Words missing from {trg_words_file2}: {len(trg1_only_words)}\n")

    with (lex_path2 / "lex_coverage.txt").open("a", encoding="utf8") as output_file:
        if args.src:
            output_file.write(f"Unique words in src.txt: {len(unique_src_words2)}\n")
            output_file.write(
                f"Words also found in {src_words_file1}: {len(unique_src_words2) - len(src2_only_words)}\n")
            output_file.write(f"Words missing from {src_words_file1}: {len(src2_only_words)}\n")
        if args.trg:
            output_file.write(f"Unique words in trg.txt: {len(unique_trg_words2)}\n")
            output_file.write(
                f"Words also found in {trg_words_file1}: {len(unique_trg_words2) - len(trg2_only_words)}\n")
            output_file.write(f"Words missing from {trg_words_file1}: {len(trg2_only_words)}\n")

    # Output stats if requested
    if args.stats:
        if args.src:
            print(f"Unique words in src.txt: {len(unique_src_words1)}")
            print(f"Words also found in {src_words_file2}: {len(unique_src_words1) - len(src1_only_words)}")
            print(f"Words missing from {src_words_file2}: {len(src1_only_words)}")
        if args.trg:
            print(f"Unique words in trg.txt: {len(unique_trg_words1)}")
            print(f"Words also found in {trg_words_file2}: {len(unique_trg_words1) - len(trg1_only_words)}")
            print(f"Words missing from {trg_words_file2}: {len(trg1_only_words)}")


if __name__ == "__main__":
    main()
150 changes: 150 additions & 0 deletions silnlp/common/count_words.py
@@ -0,0 +1,150 @@
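"""Count lexicon entries for an alignment experiment.

The script tallies the words in the experiment's src.txt and trg.txt files, reads the
aligner's lexicon, collects the target renderings of the most common and most diverse
source words, and writes trg_renderings.csv, src_words.txt, and lex_stats.csv to the
experiment folder. Run it as a module from the repository root, e.g. (with a placeholder
experiment folder name):

    python -m silnlp.common.count_words my_experiment --aligner eflomal --stats
"""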
import argparse
import csv
import re
from collections import Counter

import numpy
import pandas as pd

from ..common.environment import SIL_NLP_ENV


def is_word(word: str) -> bool:
    """Return True if the string contains at least one alphabetic character."""
    return any(char.isalpha() for char in word)

def main() -> None:
    parser = argparse.ArgumentParser(description="Counts lexicon entries")
    parser.add_argument("experiment", help="Experiment folder from path S:\\Alignment\\experiments\\")
    parser.add_argument("--aligner", help="Aligner: eflomal, fast-align, hmm", default="eflomal")
    parser.add_argument("--num", help="Number of most common words to include", type=int, default=100)
    parser.add_argument("--stats", help="If set, print the word counts and number of renderings for common words",
                        action="store_true")
    args = parser.parse_args()

    # Set up the path and lexicon file names
    lex_path = SIL_NLP_ENV.align_experiments_dir / args.experiment
    lex_txt_file = "lexicon." + args.aligner + ".txt"
    new_lex_txt_file = "lexicon." + args.aligner + "_clean.txt"

    # Get the source and target iso codes from the experiment config
    with (lex_path / "config.yml").open("r", encoding="utf8") as conf:
        for line in conf:
            if "src" in line.split(" ")[0]:
                src_iso = line.split(" ")[1].split("-")[0]
            elif "trg" in line.split(" ")[0]:
                trg_iso = line.split(" ")[1].split("-")[0]
    # TODO: error or use a default if it fails to get both iso codes

    # Look for commas with no following whitespace
    pattern = re.compile(r",(?=\S)")

    # Pull all the separate words from the source data. Take the most common and all unique words.
    src_words = []
    with (lex_path / "src.txt").open("r", encoding="utf8") as src_data_file:
        for line in src_data_file:
            for word in line.split(" "):
                word = word.strip().strip("'\"\\;,:.!?()-[]").lower()
                # Add a space after the comma as needed
                finder = pattern.search(word)
                if finder:
                    word = word[:finder.span()[1]] + " " + word[finder.span()[1]:]
                if word != "" and not word.isnumeric():
                    src_words.append(word)
    src_data_word_counter = Counter(src_words).most_common(args.num)
    unique_src_words = numpy.unique(numpy.array(src_words))

    # Pull all the separate words from the target data. Take all unique words.
    trg_words = []
    with (lex_path / "trg.txt").open("r", encoding="utf8") as trg_data_file:
        for line in trg_data_file:
            for word in line.split(" "):
                word = word.strip().strip("'\"\\;,:.!?()-[]").lower()
                # Add a space after the comma as needed
                finder = pattern.search(word)
                if finder:
                    word = word[:finder.span()[1]] + " " + word[finder.span()[1]:]
                if word != "" and not word.isnumeric():
                    trg_words.append(word)
    unique_trg_words = numpy.unique(numpy.array(trg_words))

    # Clean the lexicon file and prep it for the pandas csv reader
    with (lex_path / lex_txt_file).open("r", encoding="utf8") as lexicon:
        with (lex_path / new_lex_txt_file).open("w", encoding="utf8") as new_lex:
            for line in lexicon.readlines():
                line = line.replace("'", "\\'").replace("\"", "\\\"")
                if is_word(line.split("\t")[0]):
                    new_lex.write(line)

    # Read the lexicon into a dataframe after escaping out quotes.
    # Find the most diverse src words (most lexicon entries).
    lex_df = pd.read_csv(lex_path / new_lex_txt_file, sep="\t")
    lex_df.columns = [src_iso, trg_iso, "percent"]
    lex_word_counter = Counter(lex_df[src_iso]).most_common(args.num)

    # Find all the renderings for the most diverse words.
    diverse_wd = {}  # Dictionary of most diverse src words and their trg renderings
    diverse_wd_renderings = 0  # Cumulative trg renderings for the most diverse src words
    for word, count in lex_word_counter:
        diverse_wd_renderings += count
        diverse_wd[word] = []
        for index, trg_word in enumerate(lex_df[trg_iso]):
            if word == lex_df[src_iso][index]:
                diverse_wd[word].append(trg_word)

    # Find all the renderings for the most common words.
    common_wd = {}  # Dictionary of most common src words and their trg renderings
    common_wd_instances = 0  # Instances of the most common src words
    common_wd_renderings = 0  # Cumulative trg renderings for the most common src words
    for word, count in src_data_word_counter:
        common_wd_instances += count
        common_wd[word] = []
        for index, trg_word in enumerate(lex_df[trg_iso]):
            if word == lex_df[src_iso][index]:
                common_wd[word].append(trg_word)
    for renderings in common_wd.values():
        common_wd_renderings += len(renderings)

    # Write the dictionary of renderings for the most common words to a .csv file in the experiment directory.
    with (lex_path / "trg_renderings.csv").open("w", encoding="utf8", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([f"{src_iso} Word", f"{trg_iso} Words"])
        for src_wd in common_wd:
            writer.writerow([src_wd, *common_wd[src_wd]])

    with (lex_path / "src_words.txt").open("w", encoding="utf8") as output_file:
        for word in unique_src_words:
            output_file.write(word + "\n")

    # Optionally, output a few stats
    if args.stats:
        print(f"\nSource Data: {len(src_words)} words and {len(unique_src_words)} unique words.")
        print(f"Target Data: {len(trg_words)} words and {len(unique_trg_words)} unique words.")
        print(f"\nThe {args.num} most common source words in the dataset appear an average of "
              f"{round(common_wd_instances / args.num)} times per word.")
        print(f"The {args.num} most common source words in the dataset have an average of "
              f"{round(common_wd_renderings / args.num)} target renderings per word.")
        print(f"The {args.num} most diverse source words in the dataset have an average of "
              f"{round(diverse_wd_renderings / args.num)} renderings per word.\n")

    # Print the "score" and write it to file
    score = round(100 * (1 / (common_wd_renderings / args.num)))
    print(f"Internal Consistency score is {score}.\n A score of 100 indicates that the top source words "
          f"each average 1 target rendering.\n Score = 100 * 1 / (average trg renderings per most common src word)")

with (lex_path / "lex_stats.csv").open("w", encoding="utf8") as stats_file:
writer = csv.writer(stats_file)
writer.writerow(['#_Src_wds','#_trg_wds','Unique_src','Unique_trg','Num_top_wds','Avg_inst',
'avg_trg_renderings','avg_diverse_renderings'])
writer.writerow([len(src_words),len(trg_words),len(unique_src_words),len(unique_trg_words),args.num,
round((common_wd_instances)/args.num),round((common_wd_renderings)/args.num),
round((diverse_wd_renderings)/args.num)])


if __name__ == "__main__":
    main()