update link in example notebook

CornellNLP · Apr 19, 2019 · 0c32a65 · 0c32a65
1 parent 44de7ea
commit 0c32a65
Showing 1 changed file with 55 additions and 32 deletions.
diff --git a/examples/converting_movie_corpus.ipynb b/examples/converting_movie_corpus.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 66,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -70,16 +70,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
+    "# replace this path with the directory where your downloaded Cornell Movie-Dialogs Corpus is saved\n",
     "data_dir = \"../../data_collection/cornell_movie_dialogs_corpus/\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -89,7 +90,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -112,7 +113,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -128,7 +129,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
@@ -145,7 +146,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
@@ -158,7 +159,7 @@
        " 'credit_pos': '4'}"
       ]
      },
-     "execution_count": 36,
+     "execution_count": 25,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -194,7 +195,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -204,36 +205,42 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 134,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 304713/304713 [00:02<00:00, 109601.49it/s]\n"
+      "100%|██████████| 304713/304713 [00:07<00:00, 41637.13it/s]\n"
      ]
     }
    ],
    "source": [
     "utterance_corpus = {}\n",
     "\n",
+    "count = 0\n",
     "for utterance in tqdm(utterance_data):\n",
     "    \n",
     "    utterance_info = [info.strip() for info in utterance.split(\"+++$+++\")]\n",
     "    \n",
     "    # ignoring character name since User object already has information\n",
     "    idx, user, movie_id, text = utterance_info[0], utterance_info[1], utterance_info[2], utterance_info[4]\n",
     "    \n",
-    "    meta = {'movie_id': movie_id}\n",
+    "    \n",
+    "    if count % 2 == 0:\n",
+    "        meta = {'movie_id': movie_id}\n",
+    "    else:\n",
+    "        meta = {'movie_id': movie_id}\n",
+    "    count += 1\n",
     "    \n",
     "    # root & reply_to will be updated later, timestamp is not applicable \n",
     "    utterance_corpus[idx] = Utterance(idx, corpus_users[user], None, None, None, text, meta=meta)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 135,
    "metadata": {},
    "outputs": [
     {
@@ -242,7 +249,7 @@
        "304713"
       ]
      },
-     "execution_count": 39,
+     "execution_count": 135,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -260,16 +267,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 88,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "Utterance({'id': 'L1044', 'user': User([('name', 'u2')]), 'root': None, 'reply_to': None, 'timestamp': None, 'text': 'They do to!', 'meta': {'movie_id': 'm0'}})"
+       "Utterance({'id': 'L1044', 'user': User([('name', 'u2')]), 'root': None, 'reply_to': None, 'timestamp': None, 'text': 'They do to!', 'meta': {'movie_id': 'm0', 'test': []}})"
       ]
      },
-     "execution_count": 40,
+     "execution_count": 88,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -295,7 +302,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 89,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -305,7 +312,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 90,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -314,14 +321,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 136,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 83097/83097 [00:03<00:00, 26163.31it/s]\n"
+      "100%|██████████| 83097/83097 [00:02<00:00, 28463.86it/s]\n"
      ]
     }
    ],
@@ -359,16 +366,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 92,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "Utterance({'id': 'L666499', 'user': User([('name', 'u9028')]), 'root': 'L666497', 'reply_to': 'L666498', 'timestamp': None, 'text': 'How quickly can you move your artillery forward?', 'meta': {'movie_id': 'm616'}})"
+       "Utterance({'id': 'L666499', 'user': User([('name', 'u9028')]), 'root': 'L666497', 'reply_to': 'L666498', 'timestamp': None, 'text': 'How quickly can you move your artillery forward?', 'meta': {'movie_id': 'm616', 'test': []}})"
       ]
      },
-     "execution_count": 45,
+     "execution_count": 92,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -394,7 +401,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 137,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -403,13 +410,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": 138,
    "metadata": {},
    "outputs": [],
    "source": [
     "movie_corpus = Corpus(utterances=utterance_list, version=1)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -484,7 +498,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from parser import Parser"
+    "from convokit import Parser"
    ]
   },
   {
@@ -662,18 +676,27 @@
    "metadata": {},
    "source": [
     "#### Saving created datasets\n",
-    "To complete the final step of dataset conversion, we want to save the dataset such that it can be loaded later for reuse. You may want to specify a name. The default location to find the saved datasets will be __./convokit/saved-copora__ in your home directory.  "
+    "To complete the final step of dataset conversion, we save the dataset so that it can be loaded later for reuse. You may want to specify a name. By default, saved datasets are written to __~/.convokit/saved-corpora__ (under your home directory), but you can also specify where you want the saved corpora to go. "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 66,
+   "execution_count": 145,
    "metadata": {},
    "outputs": [],
    "source": [
+    "# movie_corpus.dump(\"movie-corpus\", base_path = <specify where you prefer to save it to>)\n",
+    "# the following would save the Corpus to the default location\n",
     "movie_corpus.dump(\"movie-corpus\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -687,7 +710,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from util import meta_index"
+    "from convokit import meta_index"
    ]
   },
   {
@@ -732,7 +755,7 @@
    "source": [
     "### Other ways of conversion\n",
     "\n",
-    "The above method is only one way to convert the dataset. Alternatively, one may follow strictly with the specifications of the expected data format described [here](https://convokit.cornell.edu/) and write out the component files directly. "
+    "The above method is only one way to convert the dataset. Alternatively, one may strictly follow the specification of the expected data format described [here](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/doc/source/data_format.rst) and write out the component files directly. "
    ]
   },
   {
@@ -759,7 +782,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.2"
+   "version": "3.6.5"
   }
  },
  "nbformat": 4,