Skip to content

Commit

Permalink
update link in example notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
liye committed Apr 19, 2019
1 parent 44de7ea commit 0c32a65
Showing 1 changed file with 55 additions and 32 deletions.
87 changes: 55 additions & 32 deletions examples/converting_movie_corpus.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -70,16 +70,17 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# replace the directory with where your downloaded cornell movie dialogs corpus is saved\n",
"data_dir = \"../../data_collection/cornell_movie_dialogs_corpus/\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -89,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -112,7 +113,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -128,7 +129,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 24,
"metadata": {},
"outputs": [
{
Expand All @@ -145,7 +146,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 25,
"metadata": {},
"outputs": [
{
Expand All @@ -158,7 +159,7 @@
" 'credit_pos': '4'}"
]
},
"execution_count": 36,
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -194,7 +195,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -204,36 +205,42 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 134,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 304713/304713 [00:02<00:00, 109601.49it/s]\n"
"100%|██████████| 304713/304713 [00:07<00:00, 41637.13it/s]\n"
]
}
],
"source": [
"utterance_corpus = {}\n",
"\n",
"count = 0\n",
"for utterance in tqdm(utterance_data):\n",
" \n",
" utterance_info = [info.strip() for info in utterance.split(\"+++$+++\")]\n",
" \n",
" # ignoring character name since User object already has information\n",
" idx, user, movie_id, text = utterance_info[0], utterance_info[1], utterance_info[2], utterance_info[4]\n",
" \n",
" meta = {'movie_id': movie_id}\n",
" \n",
" if count % 2 == 0:\n",
" meta = {'movie_id': movie_id}\n",
" else:\n",
" meta = {'movie_id': movie_id}\n",
" count += 1\n",
" \n",
" # root & reply_to will be updated later, timestamp is not applicable \n",
" utterance_corpus[idx] = Utterance(idx, corpus_users[user], None, None, None, text, meta=meta)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 135,
"metadata": {},
"outputs": [
{
Expand All @@ -242,7 +249,7 @@
"304713"
]
},
"execution_count": 39,
"execution_count": 135,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -260,16 +267,16 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Utterance({'id': 'L1044', 'user': User([('name', 'u2')]), 'root': None, 'reply_to': None, 'timestamp': None, 'text': 'They do to!', 'meta': {'movie_id': 'm0'}})"
"Utterance({'id': 'L1044', 'user': User([('name', 'u2')]), 'root': None, 'reply_to': None, 'timestamp': None, 'text': 'They do to!', 'meta': {'movie_id': 'm0', 'test': []}})"
]
},
"execution_count": 40,
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -295,7 +302,7 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -305,7 +312,7 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -314,14 +321,14 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 136,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 83097/83097 [00:03<00:00, 26163.31it/s]\n"
"100%|██████████| 83097/83097 [00:02<00:00, 28463.86it/s]\n"
]
}
],
Expand Down Expand Up @@ -359,16 +366,16 @@
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": 92,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Utterance({'id': 'L666499', 'user': User([('name', 'u9028')]), 'root': 'L666497', 'reply_to': 'L666498', 'timestamp': None, 'text': 'How quickly can you move your artillery forward?', 'meta': {'movie_id': 'm616'}})"
"Utterance({'id': 'L666499', 'user': User([('name', 'u9028')]), 'root': 'L666497', 'reply_to': 'L666498', 'timestamp': None, 'text': 'How quickly can you move your artillery forward?', 'meta': {'movie_id': 'm616', 'test': []}})"
]
},
"execution_count": 45,
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -394,7 +401,7 @@
},
{
"cell_type": "code",
"execution_count": 47,
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -403,13 +410,20 @@
},
{
"cell_type": "code",
"execution_count": 54,
"execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"movie_corpus = Corpus(utterances=utterance_list, version=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -484,7 +498,7 @@
"metadata": {},
"outputs": [],
"source": [
"from parser import Parser"
"from convokit import Parser"
]
},
{
Expand Down Expand Up @@ -662,18 +676,27 @@
"metadata": {},
"source": [
"#### Saving created datasets\n",
"To complete the final step of dataset conversion, we want to save the dataset such that it can be loaded later for reuse. You may want to specify a name. The default location to find the saved datasets will be __./convokit/saved-copora__ in your home directory. "
"To complete the final step of dataset conversion, we want to save the dataset such that it can be loaded later for reuse. You may want to specify a name. The default location to find the saved datasets will be __./convokit/saved-copora__ in your home directory, but you can also specify where you want the saved corpora to be. "
]
},
{
"cell_type": "code",
"execution_count": 66,
"execution_count": 145,
"metadata": {},
"outputs": [],
"source": [
"# movie_corpus.dump(\"movie-corpus\", base_path = <specify where you prefer to save it to>)\n",
"# the following would save the Corpus to the default location\n",
"movie_corpus.dump(\"movie-corpus\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -687,7 +710,7 @@
"metadata": {},
"outputs": [],
"source": [
"from util import meta_index"
"from convokit import meta_index"
]
},
{
Expand Down Expand Up @@ -732,7 +755,7 @@
"source": [
"### Other ways of conversion\n",
"\n",
"The above method is only one way to convert the dataset. Alternatively, one may follow strictly with the specifications of the expected data format described [here](https://convokit.cornell.edu/) and write out the component files directly. "
"The above method is only one way to convert the dataset. Alternatively, one may follow strictly with the specifications of the expected data format described [here](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/doc/source/data_format.rst) and write out the component files directly. "
]
},
{
Expand All @@ -759,7 +782,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
"version": "3.6.5"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 0c32a65

Please sign in to comment.