Skip to content

Commit

Permalink
Trying something crazy...
Browse files Browse the repository at this point in the history
  • Loading branch information
meyersbs committed Oct 17, 2016
1 parent 6fdc2b3 commit 35bd57d
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 29 deletions.
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name='SPLAT-library',
version='0.3.6',
version='0.3.7',
description='Speech Processing & Linguistic Analysis Tool',
long_description="SPLAT is a command-line application designed to make it easy for linguists (both computer-oriented and non-computer-oriented) to use the Natural Language Tool Kit (NLTK) for analyzing virtually any text file.\n\nSPLAT is designed to help you gather linguistic features from text files and it is assumed that most input files will not be already annotated. In order for SPLAT to function properly, you should ensure that the input files that you provide do not contain any annotations. Because there are so many variations of linguistic annotation schemes, it would simply be impossible to account for all of them in the initial parsing of input files; it is easier for you to remove any existing annotations than it is for me to do so.",
url='http://splat-library.org',
Expand All @@ -23,7 +23,7 @@
'splat.taggers',
'splat.tokenizers'
],
download_url='https://github.com/meyersbs/SPLAT/archive/v0.3.6.tar.gz',
download_url='https://github.com/meyersbs/SPLAT/archive/v0.3.7.tar.gz',
requires=['matplotlib', 'nltk', 'jsonpickle'],
classifiers=[
'Development Status :: 3 - Alpha',
Expand Down
37 changes: 10 additions & 27 deletions splat/SPLAT.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,7 @@ class SPLAT:
# Frequency Distribution Variables
__freq_dist = None

# Object Declarations
__ngramminator = FullNGramminator()
__cleantokenizer = CleanTokenizer()
__rawtokenizer = RawTokenizer()
__sentenizer = CleanSentenizer()
__postagger = NLTKPOSTagger()
__treestring_gen = TreeStringParser()

def __init__(self, text, ngramminator=FullNGramminator(), postagger=NLTKPOSTagger()):
def __init__(self, text):
"""
Creates a SPLAT Object.
"""
Expand All @@ -89,23 +81,15 @@ def __init__(self, text, ngramminator=FullNGramminator(), postagger=NLTKPOSTagge
raise ValueError("WARNING: SPLAT must be of type str or file.")

self.__uttcount = len(self.__utterances)
self.__sentences = self.__sentenizer.sentenize(self.__splat)
self.__sentences = CleanSentenizer().sentenize(self.__splat)
if self.__sentences == []: self.__sentences = self.__utterances
self.__sentcount = len(self.__sentences)
self.__rawtokens = self.__rawtokenizer.tokenize(self.__splat)
self.__tokens = self.__cleantokenizer.tokenize(self.__splat)
self.__rawtokens = RawTokenizer().tokenize(self.__splat)
self.__tokens = CleanTokenizer().tokenize(self.__splat)
self.__rawtypes = Util.typify(self.__rawtokens)
self.__types = Util.typify(self.__tokens)
self.__wordcount = Util.wordcount(self.__rawtokens)
self.__unique_wordcount = Util.wordcount(self.__types)
if ngramminator == {}:
self.__ngramminator = FullNGramminator()
else:
self.__ngramminator = ngramminator
if postagger == {}:
self.__postagger = NLTKPOSTagger()
else:
self.__postagger = postagger
self.__ttr = Util.type_token_ratio(self.__types, self.__tokens)
self.__alu = round(float(self.__wordcount) / float(self.__uttcount), 4) if self.__uttcount != 0 else 0.0
self.__als = round(float(self.__wordcount) / float(self.__sentcount), 4) if self.__sentcount != 0 else 0.0
Expand Down Expand Up @@ -309,23 +293,23 @@ def shortest_words(self):
def unigrams(self):
""" Returns a list of unigrams. """
if self.__unigrams is None:
self.__unigrams = self.__ngramminator.unigrams(self.__splat)
self.__unigrams = FullNGramminator().unigrams(self.__splat)
return self.__unigrams
else:
return self.__unigrams

def bigrams(self):
""" Returns a list of bigrams. """
if self.__bigrams is None:
self.__bigrams = self.__ngramminator.bigrams(self.__splat)
self.__bigrams = FullNGramminator().bigrams(self.__splat)
return self.__bigrams
else:
return self.__bigrams

def trigrams(self):
""" Returns a list of trigrams. """
if self.__trigrams is None:
self.__trigrams = self.__ngramminator.trigrams(self.__splat)
self.__trigrams = FullNGramminator().trigrams(self.__splat)
return self.__trigrams
else:
return self.__trigrams
Expand All @@ -341,14 +325,14 @@ def ngrams(self, n):
elif n == 3:
return self.trigrams()
else:
return self.__ngramminator.ngrams(self.__splat, n)
return FullNGramminator().ngrams(self.__splat, n)

##### PART-OF-SPEECH BASED #########################################################################################

def pos(self):
""" Returns a list of tuple pairs: (word, POS tag). """
if self.__pos is None:
self.__pos = self.__postagger.tag(self.__splat)
self.__pos = NLTKPOSTagger().tag(self.__splat)
return self.__pos

def content_function_ratio(self):
Expand Down Expand Up @@ -392,8 +376,7 @@ def pos_counts(self):
def treestrings(self):
""" Returns a list of parse trees. """
if self.__treestrings is None:
self.__treestring_gen = TreeStringParser()
self.__treestrings = self.__treestring_gen.get_parse_trees(self.__utterances)
self.__treestrings = TreeStringParser().get_parse_trees(self.__utterances)
return self.__treestrings

def drawtrees(self):
Expand Down
Binary file removed tests/disfluency_sample.txt.splat
Binary file not shown.
Binary file removed tests/roark_sample.txt.splat
Binary file not shown.
Binary file removed tests/whitman_test.txt.splat
Binary file not shown.

0 comments on commit 35bd57d

Please sign in to comment.