Merge remote-tracking branch 'origin/master'

PhonologicalCorpusTools · Mar 5, 2019 · d938eb7 · d938eb7
2 parents c133a3f + 7a8b0b5
commit d938eb7
Show file tree

Hide file tree

Showing 3 changed files with 92 additions and 19 deletions.
diff --git a/corpustools/corpus/io/csv.py b/corpustools/corpus/io/csv.py
@@ -87,6 +87,75 @@ def inspect_csv(path, num_lines = 10, coldelim = None, transdelim = None):
 
     return atts, best
 
+def check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types=None, feature_system_path=None,
+                               stop_check=None, call_back=None):
+    """Collect the segments of a csv corpus that the feature system does not cover.
+
+    Parameters
+    ----------
+    corpus_name : str
+        Name of the corpus; unused, accepted to mirror ``load_corpus_csv``
+    path : str
+        Full path to the csv file
+    delimiter : str
+        Column delimiter used in the file
+    annotation_types : list, optional
+        Column descriptions; obtained from ``inspect_csv`` when omitted
+    feature_system_path : str, optional
+        Path to a saved feature matrix; without one no segment is checked
+    stop_check, call_back : callable, optional
+        Unused, accepted to mirror ``load_corpus_csv``
+
+    Returns
+    -------
+    set
+        Segments found in 'tier' columns that are not in the feature matrix
+
+    Raises
+    ------
+    DelimiterError
+        If the header row cannot be split on ``delimiter``
+    """
+    # Stays None without a usable feature system (the lookup used to hit an unbound name).
+    feature_matrix = None
+    if feature_system_path is not None and os.path.exists(feature_system_path):
+        feature_matrix = modernize.modernize_specifier(load_binary(feature_system_path))
+    if annotation_types is None:
+        annotation_types, delimiter = inspect_csv(path, coldelim=delimiter)
+    for a in annotation_types:
+        a.reset()
+    missing = set()
+    with open(path, encoding='utf-8-sig') as f:
+        if len(f.readline().split(delimiter)) == 1:
+            raise DelimiterError('Could not parse the corpus.\nCheck that the column delimiter you typed in '
+                                 'matches the one used in the file.')
+        # The header row is consumed above; with no feature matrix there is nothing to compare.
+        for line in (f if feature_matrix is not None else ()):
+            line = line.strip()
+            if not line:
+                continue
+            for k, v in zip(annotation_types, line.split(delimiter)):
+                if k.attribute.att_type != 'tier':
+                    continue
+                v = v.strip()
+                ignored, sd, td = k.ignored_characters, k.syllable_delimiter, k.trans_delimiter
+                if ignored is not None:
+                    v = ''.join(x for x in v if x not in ignored)
+                # Stress and tone marks are stripped before the segment lookup.
+                supra_spec = set(k.stress_specification.keys()).union(k.tone_specification.keys())
+                for syllable in ([v] if sd is None else v.split(sd)):
+                    syllable = ''.join(x for x in syllable if x not in supra_spec)
+                    if td is not None:
+                        string = syllable.split(td)
+                    elif k.digraph_pattern is not None:
+                        string = k.digraph_pattern.findall(syllable)
+                    else:
+                        string = list(syllable)
+                    missing.update(seg for seg in string if seg and seg not in feature_matrix.segments)
+    print('In csv.py', missing)
+    return missing
+
+
 def load_corpus_csv(corpus_name, path, delimiter,
                     annotation_types = None,
                     feature_system_path = None,
@@ -117,6 +186,9 @@ def load_corpus_csv(corpus_name, path, delimiter,
         Corpus object generated from the text file
 
     """
+    check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types, feature_system_path,
+                               stop_check, call_back)
+
     corpus = Corpus(corpus_name)
     if feature_system_path is not None and os.path.exists(feature_system_path):
         feature_matrix = load_binary(feature_system_path)
@@ -171,7 +243,6 @@ def load_corpus_csv(corpus_name, path, delimiter,
                     d[k.attribute.name] = (k.attribute, v)
             word = Word(**d)
 
-            # TODO: what is the following code doing?
             if word.transcription:
                 #transcriptions can have phonetic symbol delimiters
                 if not word.spelling:

diff --git a/corpustools/corpus/io/helper.py b/corpustools/corpus/io/helper.py
@@ -71,22 +71,22 @@ def __init__(self, syllable, feature_matrix, annotation_type, begin=None, end=No
                     self.tone = symbol
                 else:
                     seg += symbol
-            
-            for i, j in enumerate(feature_matrix.features):
-                if j == 'syllabic':
-                    index_for_syllabic = i + 1
-            try:
-                if feature_matrix.seg_to_feat_line(seg)[index_for_syllabic] == "-":  # not syllabic
-                    if is_nucleus:
-                        self.coda.append(BaseAnnotation(seg))
-                    else:
-                        self.onset.append(BaseAnnotation(seg))
-                else:  # syllabic
-                    is_nucleus = True
-                    self.nucleus.append(BaseAnnotation(seg))
-            except KeyError as e:
-                e = MissingFeatureError('The feature values for {} is not specified.'.format(seg))
-                raise e
+
+                    for i, j in enumerate(feature_matrix.features):
+                        if j == 'syllabic':
+                            index_for_syllabic = i + 1
+                    try:
+                        if feature_matrix.seg_to_feat_line(seg)[index_for_syllabic] == "-":  # not syllabic
+                            if is_nucleus:
+                                self.coda.append(BaseAnnotation(seg))
+                            else:
+                                self.onset.append(BaseAnnotation(seg))
+                        else:  # syllabic
+                            is_nucleus = True
+                            self.nucleus.append(BaseAnnotation(seg))
+                    except KeyError as e:
+                        e = MissingFeatureError('The feature values for {} is not specified.'.format(seg))
+                        raise e
 
     def __iter__(self):
         segs = list()
@@ -502,8 +502,9 @@ def parse_transcription(string, annotation_type, feature_matrix=None, corpus=Non
 
     #final_string = []
 
-    corpus.inventory.stress_types = annotation_type.stress_specification
-    corpus.inventory.tone_types = annotation_type.tone_specification
+    if corpus is not None:
+        corpus.inventory.stress_types = annotation_type.stress_specification
+        corpus.inventory.tone_types = annotation_type.tone_specification
 
     #sd = annotation_type.syllable_delimiter
 

diff --git a/setup.py b/setup.py
@@ -56,6 +56,7 @@ def run_tests(self):
           'numpy',
           'scipy',
           'textgrid'
+          'pyqt'
           #'python-acoustic-similarity'
       ],
       entry_points = {