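Fixed error in silent pause counting: each token is now lowercased and stripped of trailing punctuation (".,!?") before it is matched against the disfluency markers.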

meyersbs · Jan 19, 2017 · cbbccfa · cbbccfa
1 parent 1f645b4
commit cbbccfa
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 12 deletions.
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='SPLAT-library',
-    version='0.4.0',
+    version='0.4.1',
     description='Speech Processing & Linguistic Analysis Tool',
     long_description="SPLAT is a command-line application designed to make it easy for linguists (both computer-oriented and non-computer-oriented) to use the Natural Language Tool Kit (NLTK) for analyzing virtually any text file.\n\nSPLAT is designed to help you gather linguistic features from text files and it is assumed that most input files will not be already annotated. In order for SPLAT to function properly, you should ensure that the input files that you provide do not contain any annotations. Because there are so many variations of linguistic annotation schemes, it would simply be impossible to account for all of them in the initial parsing of input files; it is easier for you to remove any existing annotations than it is for me to do so.",
     url='http://splat-library.org',
@@ -23,7 +23,7 @@
         'splat.taggers',
         'splat.tokenizers'
     ],
-    download_url='https://github.com/meyersbs/SPLAT/archive/v0.4.0.tar.gz',
+    download_url='https://github.com/meyersbs/SPLAT/archive/v0.4.1.tar.gz',
     requires=['matplotlib', 'nltk'],
     classifiers=[
         'Development Status :: 3 - Alpha',

diff --git a/splat/Util.py b/splat/Util.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 ##### PYTHON IMPORTS ###################################################################################################
-import re
+import re, string
 
 ##### NLTK IMPORTS #####################################################################################################
 from nltk.tree import Tree
@@ -187,23 +187,24 @@ def count_disfluencies(utterances):
 		words = 0
 		last_word = ""
 		for word in utt.split(" "):
+			clean_word = word.lower().rstrip(".,!?")
 			words += 1
-			if word.lower() == "um":
+			if clean_word == "um":
 				um_count += 1
-			elif word.lower() == "uh":
+			elif clean_word == "uh":
 				uh_count += 1
-			elif word.lower() == "ah":
+			elif clean_word == "ah":
 				ah_count += 1
-			elif word.lower() == "er":
+			elif clean_word == "er":
 				er_count += 1
-			elif word.lower() == "hm":
+			elif clean_word == "hm":
 				hm_count += 1
-			elif word.lower() == "{sl}":
+			elif clean_word == "{sl}":
 				pause_count += 1
-			elif word == last_word:
+			elif clean_word == last_word:
 				repetition_count += 1
-				last_word = word
-			elif re.search(r"-$", word):
+				last_word = clean_word
+			elif re.search(r"-$", clean_word):
 				break_count += 1
 			else:
 				pass