sync with Google drive files

shun-lin · Oct 20, 2019 · 2cd9a21 · 2cd9a21
1 parent 6168608
commit 2cd9a21
Show file tree

Hide file tree

Showing 21 changed files with 237,741 additions and 4,402 deletions.
diff --git a/BERT_Evaluation.ipynb b/BERT_Evaluation.ipynb
diff --git a/ColabTesting.ipynb b/ColabTesting.ipynb
diff --git a/Data Processing.ipynb b/Data Processing.ipynb
diff --git a/GPT.ipynb b/GPT.ipynb
diff --git a/GPT_output_jokes.docx b/GPT_output_jokes.docx
diff --git a/GPT_with_validation.ipynb b/GPT_with_validation.ipynb
diff --git a/LM_output_jokes.txt b/LM_output_jokes.txt
diff --git a/LSTM_output_jokes.txt b/LSTM_output_jokes.txt
diff --git a/Project Lion Poster.pdf b/Project Lion Poster.pdf
diff --git a/Project Lion Project Report.docx b/Project Lion Project Report.docx
diff --git a/QA Jokes.ipynb b/QA Jokes.ipynb
diff --git a/Simple RNN Test.ipynb b/Simple RNN Test.ipynb
diff --git a/capita.py b/capita.py
@@ -0,0 +1,44 @@
+# Capita is a small library that preprocesses the capitalization of text
+# and can undo this preprocessing (through unprocess_capitalization).
+# It is used in the Summarization notebook, to process raw text into words which can then be tokenized and numerized.
+# You do not need to modify this file.
+
+
+from segtok import tokenizer
+
+def preprocess_capitalization(text):
+    """Lowercase `text` token by token, marking tokens that were not lower-case.
+
+    The text is split with segtok's ``word_tokenizer``. A purely alphabetic
+    token that is entirely upper-case is preceded by "⇧"; any other purely
+    alphabetic token that is not entirely lower-case is preceded by "↑".
+    Tokens containing non-alphabetic characters never receive a marker.
+
+    Returns the markers and lowercased tokens joined by single spaces.
+    """
+    pieces = []
+    for token in tokenizer.word_tokenizer(text):
+        # Only alphabetic tokens that are not already lower-case get a marker.
+        if token.isalpha() and not token.islower():
+            # Mixed-case tokens share "↑" with capitalized ones, so capitals
+            # after the first character cannot be restored later.
+            pieces.append("⇧" if token.isupper() else "↑")
+        pieces.append(token.lower())
+    return " ".join(pieces)
+
+def unprocess_capitalization(text):
+    """Invert preprocess_capitalization by applying case markers to the next token.
+
+    `text` is split on single spaces. A "⇧" marker upper-cases the following
+    token and a "↑" marker upper-cases only its first character; when both are
+    pending, "⇧" takes precedence. Markers themselves are dropped from the
+    output, and pending markers are cleared after every non-marker token
+    (including an empty one produced by consecutive spaces).
+
+    Returns the restored tokens joined by single spaces.
+    """
+    restored = []
+    pending = set()  # markers seen since the last non-marker token
+    for token in text.split(" "):
+        if token in ("⇧", "↑"):
+            pending.add(token)
+            continue
+        if "⇧" in pending:
+            token = token.upper()
+        elif "↑" in pending:
+            # Slicing with [:1] also covers empty and one-character tokens.
+            token = token[:1].upper() + token[1:]
+        restored.append(token)
+        pending.clear()
+
+    return " ".join(restored)
diff --git a/jokes.txt b/jokes.txt