diff --git a/notebooks/00_quick_start/README.md b/notebooks/00_quick_start/README.md index 2d968665a8..685252abdd 100644 --- a/notebooks/00_quick_start/README.md +++ b/notebooks/00_quick_start/README.md @@ -15,7 +15,7 @@ data preparation, model building, and model evaluation by using the utility func | [rbm](rbm_movielens.ipynb)| MovieLens | Python CPU, GPU | Utilizing the Restricted Boltzmann Machine (rbm) [4] to predict movie ratings in a Python+GPU (TensorFlow) environment.
| [rlrmc](rlrmc_movielens.ipynb) | Movielens | Python CPU | Utilizing the Riemannian Low-rank Matrix Completion (RLRMC) [6] to predict movie rating in a Python+CPU environment | [sar](sar_movielens.ipynb) | MovieLens | Python CPU | Utilizing Simple Algorithm for Recommendation (SAR) algorithm to predict movie ratings in a Python+CPU environment. -| [sar_azureml](sar_movielens_with_azureml.ipynb)| MovieLens | Python CPU | An example of how to utilize and evaluate SAR using the [Azure Machine Learning service](https://docs.microsoft.com/azure/machine-learning/service/overview-what-is-azure-ml)(AzureML). It takes the content of the [sar quickstart notebook](sar_movielens.ipynb) and demonstrates how to use the power of the cloud to manage data, switch to powerful GPU machines, and monitor runs while training a model. +| [sar_azureml](sar_movielens_with_azureml.ipynb)| MovieLens | Python CPU | An example of how to utilize and evaluate SAR using the [Azure Machine Learning service](https://docs.microsoft.com/azure/machine-learning/service/overview-what-is-azure-ml) (AzureML). It takes the content of the [sar quickstart notebook](sar_movielens.ipynb) and demonstrates how to use the power of the cloud to manage data, switch to powerful GPU machines, and monitor runs while training a model. | [wide-and-deep](wide_deep_movielens.ipynb) | MovieLens | Python CPU, GPU | Utilizing Wide-and-Deep Model (Wide-and-Deep) [5] to predict movie ratings in a Python+GPU (TensorFlow) environment. | [xdeepfm](xdeepfm_criteo.ipynb) | Criteo, Synthetic Data | Python CPU, GPU | Utilizing the eXtreme Deep Factorization Machine (xDeepFM) [3] to learn both low and high order feature interactions for predicting CTR, in a Python+GPU (TensorFlow) environment. diff --git a/notebooks/00_quick_start/sar_movielens.ipynb b/notebooks/00_quick_start/sar_movielens.ipynb index fc36437be4..d8d19df340 100644 --- a/notebooks/00_quick_start/sar_movielens.ipynb +++ b/notebooks/00_quick_start/sar_movielens.ipynb @@ -41,16 +41,16 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n", + "System version: 3.7.3 | packaged by conda-forge | (default, Jul 1 2019, 21:52:21) \n", "[GCC 7.3.0]\n", - "Pandas version: 0.24.1\n" + "Pandas version: 0.23.4\n" ] } ], @@ -60,12 +60,11 @@ "sys.path.append(\"../../\")\n", "\n", "import logging\n", - "import time\n", - "\n", "import numpy as np\n", "import pandas as pd\n", "import papermill as pm\n", "\n", + "from reco_utils.common.timer import Timer\n", "from reco_utils.dataset import movielens\n", "from reco_utils.dataset.python_splitters import python_stratified_split\n", "from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n", @@ -91,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 3, "metadata": { "tags": [ "parameters" @@ -115,14 +114,14 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "4.93MB [00:01, 3.46MB/s] \n" + "100%|██████████| 4.81k/4.81k [00:02<00:00, 1.90kKB/s]\n" ] }, { @@ -201,7 +200,7 @@ "4 166 346 1.0 886397596" ] }, - "execution_count": 73, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -228,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 5, 
"metadata": {}, "outputs": [], "source": [ @@ -237,7 +236,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -299,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -334,73 +333,47 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 8, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2019-05-07 11:16:40,709 INFO Collecting user affinity matrix\n", - "2019-05-07 11:16:40,715 INFO Calculating time-decayed affinities\n", - "2019-05-07 11:16:40,766 INFO Creating index columns\n", - "2019-05-07 11:16:40,782 INFO Building user affinity sparse matrix\n", - "2019-05-07 11:16:40,787 INFO Calculating item co-occurrence\n", - "2019-05-07 11:16:40,910 INFO Calculating item similarity\n", - "2019-05-07 11:16:40,910 INFO Using jaccard based similarity\n", - "2019-05-07 11:16:40,990 INFO Done training\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "Took 0.284792423248291 seconds for training.\n" + "Took 0.3302565817721188 seconds for training.\n" ] } ], "source": [ - "start_time = time.time()\n", - "\n", - "model.fit(train)\n", + "with Timer() as train_time:\n", + " model.fit(train)\n", "\n", - "train_time = time.time() - start_time\n", - "print(\"Took {} seconds for training.\".format(train_time))" + "print(\"Took {} seconds for training.\".format(train_time.interval))" ] }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 9, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2019-05-07 11:16:41,003 INFO Calculating recommendation scores\n", - "2019-05-07 11:16:41,114 INFO Removing seen items\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "Took 0.1463017463684082 seconds for prediction.\n" + "Took 0.21034361701458693 seconds for prediction.\n" ] } ], "source": [ - "start_time = time.time()\n", - "\n", - "top_k = model.recommend_k_items(test, remove_seen=True)\n", + "with Timer() as test_time:\n", + " top_k = model.recommend_k_items(test, remove_seen=True)\n", "\n", - "test_time = time.time() - start_time\n", - "print(\"Took {} seconds for prediction.\".format(test_time))" + "print(\"Took {} seconds for prediction.\".format(test_time.interval))" ] }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 10, "metadata": { "scrolled": true }, @@ -435,32 +408,32 @@ " \n", " 0\n", " 1\n", - " 58\n", - " 3.049881\n", + " 204\n", + " 3.313306\n", " \n", " \n", " 1\n", " 1\n", - " 7\n", - " 3.053073\n", + " 89\n", + " 3.280465\n", " \n", " \n", " 2\n", " 1\n", - " 318\n", - " 3.059262\n", + " 11\n", + " 3.233867\n", " \n", " \n", " 3\n", " 1\n", - " 210\n", - " 3.095604\n", + " 367\n", + " 3.192575\n", " \n", " \n", " 4\n", " 1\n", - " 96\n", - " 3.124997\n", + " 423\n", + " 3.131517\n", " \n", " \n", "\n", @@ -468,19 +441,20 @@ ], "text/plain": [ " userID itemID prediction\n", - "0 1 58 3.049881\n", - "1 1 7 3.053073\n", - "2 1 318 3.059262\n", - "3 1 210 3.095604\n", - "4 1 96 3.124997" + "0 1 204 3.313306\n", + "1 1 89 3.280465\n", + "2 1 11 3.233867\n", + "3 1 367 3.192575\n", + "4 1 423 3.131517" ] }, + "execution_count": 10, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ - "display(top_k.head())" + "top_k.head()" ] }, { @@ -494,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 11, 
"metadata": {}, "outputs": [], "source": [ @@ -503,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -512,7 +486,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -521,7 +495,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -530,7 +504,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -557,17 +531,9 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 16, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2019-05-07 11:16:42,926 INFO Calculating recommendation scores\n", - "2019-05-07 11:16:43,033 INFO Removing seen items\n" - ] - }, { "data": { "text/html": [ @@ -650,7 +616,7 @@ "4 876 288 3.0 879428101 NaN" ] }, - "execution_count": 85, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -737,8 +703,8 @@ "pm.record(\"ndcg\", eval_ndcg)\n", "pm.record(\"precision\", eval_precision)\n", "pm.record(\"recall\", eval_recall)\n", - "pm.record(\"train_time\", train_time)\n", - "pm.record(\"test_time\", test_time)" + "pm.record(\"train_time\", train_time.interval)\n", + "pm.record(\"test_time\", test_time.interval)" ] } ], @@ -759,7 +725,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/notebooks/00_quick_start/sar_movielens_with_azureml.ipynb b/notebooks/00_quick_start/sar_movielens_with_azureml.ipynb index f99e1eced6..e09110cd04 100644 --- a/notebooks/00_quick_start/sar_movielens_with_azureml.ipynb +++ b/notebooks/00_quick_start/sar_movielens_with_azureml.ipynb @@ -48,8 +48,7 @@ "- SAR does not use item or user features, so cannot handle cold-start use cases\n", "- SAR requires the creation of an $mxm$ dense matrix (where $m$ is the number of items). 
So memory consumption can be an issue with large numbers of items.\n", "- SAR is best used for ranking items per user, as the scale of predicted ratings may be different from the input range and will differ across users.\n", - "For more details see the deep dive notebook on SAR here: [SAR Deep Dive Notebook](../02_model/sar_deep_dive.ipynb)", - "\n", + "For more details see the deep dive notebook on SAR here: [SAR Deep Dive Notebook](../02_model/sar_deep_dive.ipynb)\n", "---\n", "## Prerequisities\n", " - **Azure Subscription**\n", @@ -60,14 +59,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "azureml.core version: 1.0.23\n" + "azureml.core version: 1.0.18\n" ] } ], @@ -78,6 +77,7 @@ "\n", "import os\n", "import shutil\n", + "import numpy as np\n", "from tempfile import TemporaryDirectory\n", "\n", "import azureml\n", @@ -94,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 26, "metadata": { "tags": [ "parameters" @@ -106,7 +106,7 @@ "TOP_K = 10\n", "\n", "# Select Movielens data size: 100k, 1m, 10m, or 20m\n", - "MOVIELENS_DATA_SIZE = '1m'" + "MOVIELENS_DATA_SIZE = '100k'" ] }, { @@ -126,19 +126,9 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Performing interactive authentication. Please follow the instructions on the terminal.\n", - "To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code AA9E5YB5M to authenticate.\n", - "Found the config file in: /data/home/testuser/notebooks/Recommenders/notebooks/00_quick_start/.azureml/config.json\n" - ] - } - ], + "outputs": [], "source": [ "ws = get_or_create_workspace(\n", " subscription_id=\"\",\n", @@ -158,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -178,31 +168,23 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "5.92MB [00:00, 13.8MB/s] \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Uploading /tmp/tmpc98zwvek/movielens_1m_data.pkl\n", - "Uploaded /tmp/tmpc98zwvek/movielens_1m_data.pkl, 1 files out of an estimated total of 1\n" + "100%|██████████| 4.81k/4.81k [00:02<00:00, 1.98kKB/s]\n" ] }, { "data": { "text/plain": [ - "$AZUREML_DATAREFERENCE_3d32ed3550b24ea9af3ce37c4977b877" + "$AZUREML_DATAREFERENCE_57dbc7117f67479892135cec2819b78b" ] }, - "execution_count": 16, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -221,7 +203,7 @@ "data.to_pickle(os.path.join(tmp_dir.name, data_file_name))\n", "\n", "ds = ws.get_default_datastore()\n", - "ds.upload(src_dir=tmp_dir.name, target_path=TARGET_DIR, overwrite=True, show_progress=True)" + "ds.upload(src_dir=tmp_dir.name, target_path=TARGET_DIR, overwrite=True, show_progress=False)" ] }, { @@ -261,14 +243,18 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Found existing compute target\n" + "Creating a new compute target...\n", + "Creating\n", + "Succeeded\n", + "AmlCompute wait for completion finished\n", + "Minimum number of nodes requested have been provisioned\n" ] } ], @@ -309,7 +295,7 @@ }, { 
"cell_type": "code", - "execution_count": 18, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -330,14 +316,14 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Writing /tmp/tmpc98zwvek/movielens-sar/train.py\n" + "Writing /tmp/tmp6imrlt0z/movielens-sar/train.py\n" ] } ], @@ -350,16 +336,21 @@ "import pandas as pd\n", "import itertools\n", "import logging\n", - "import time\n", "\n", "from azureml.core import Run\n", "from sklearn.externals import joblib\n", "\n", + "from reco_utils.common.timer import Timer\n", "from reco_utils.dataset import movielens\n", - "from reco_utils.dataset.python_splitters import python_random_split\n", + "from reco_utils.dataset.python_splitters import python_stratified_split\n", "from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n", "from reco_utils.recommender.sar.sar_singlenode import SARSingleNode\n", "\n", + "\n", + "logging.basicConfig(level=logging.DEBUG, \n", + " format='%(asctime)s %(levelname)-8s %(message)s')\n", + "\n", + "\n", "TARGET_DIR = 'movielens'\n", "OUTPUT_FILE_NAME = 'outputs/movielens_sar_model.pkl'\n", "MODEL_FILE_NAME = 'movielens_sar_model.pkl'\n", @@ -376,17 +367,7 @@ "parser.add_argument('--data-size', type=str, dest='data_size', default=10, help='Movielens data size: 100k, 1m, 10m, or 20m')\n", "args = parser.parse_args()\n", "\n", - "data_pickle_path = os.path.join(args.data_folder, args.data_file)\n", - "\n", - "data = pd.read_pickle(path=data_pickle_path)\n", - "\n", - "# Log arguments to the run for tracking\n", - "run.log(\"top-k\", args.top_k)\n", - "run.log(\"data-size\", args.data_size)\n", - "\n", - "train, test = python_random_split(data)\n", - "\n", - "# instantiate the SAR algorithm and set the index\n", + "# set col names\n", "header = {\n", " \"col_user\": \"UserId\",\n", " \"col_item\": \"MovieId\",\n", @@ -394,34 +375,39 @@ " \"col_timestamp\": \"Timestamp\",\n", "}\n", "\n", - "logging.basicConfig(level=logging.DEBUG, \n", - " format='%(asctime)s %(levelname)-8s %(message)s')\n", + "# read data\n", + "data_pickle_path = os.path.join(args.data_folder, args.data_file)\n", + "data = pd.read_pickle(path=data_pickle_path)\n", "\n", + "# Log arguments to the run for tracking\n", + "run.log(\"top-k\", args.top_k)\n", + "run.log(\"data-size\", args.data_size)\n", + "\n", + "# split dataset into train and test\n", + "train, test = python_stratified_split(data, ratio=0.75, col_user=header[\"col_user\"], col_item=header[\"col_item\"], seed=42)\n", + "\n", + "# instantiate the model\n", "model = SARSingleNode(\n", - " remove_seen=True, similarity_type=\"jaccard\", \n", - " time_decay_coefficient=30, time_now=None, timedecay_formula=True, **header\n", + " similarity_type=\"jaccard\", \n", + " time_decay_coefficient=30, \n", + " time_now=None, \n", + " timedecay_formula=True, \n", + " **header\n", ")\n", "\n", "# train the SAR model\n", - "start_time = time.time()\n", + "with Timer() as t:\n", + " model.fit(train)\n", "\n", - "model.fit(train)\n", + "run.log(name=\"Training time\", value=t.interval)\n", "\n", - "train_time = time.time() - start_time\n", - "run.log(name=\"Training time\", value=train_time)\n", + "# predict top k items\n", + "with Timer() as t:\n", + " top_k = model.recommend_k_items(test, remove_seen=True)\n", "\n", - "start_time = time.time()\n", + "run.log(name=\"Prediction time\", value=t.interval)\n", "\n", - "top_k = 
model.recommend_k_items(test)\n", - "\n", - "test_time = time.time() - start_time\n", - "run.log(name=\"Prediction time\", value=test_time)\n", - "\n", - "# TODO: remove this call when the model returns same type as input\n", - "top_k['UserId'] = pd.to_numeric(top_k['UserId'])\n", - "top_k['MovieId'] = pd.to_numeric(top_k['MovieId'])\n", - "\n", - "# evaluate\n", + "# compute evaluation metrics\n", "eval_map = map_at_k(test, top_k, col_user=\"UserId\", col_item=\"MovieId\", \n", " col_rating=\"Rating\", col_prediction=\"prediction\", \n", " relevancy_method=\"top_k\", k=args.top_k)\n", @@ -449,16 +435,16 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'/tmp/tmpc98zwvek/movielens-sar/reco_utils'" + "'/tmp/tmp6imrlt0z/movielens-sar/reco_utils'" ] }, - "execution_count": 20, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -466,6 +452,8 @@ "source": [ "# copy dependent python files\n", "UTILS_DIR = os.path.join(SCRIPT_DIR, 'reco_utils')\n", + "if os.path.exists(UTILS_DIR):\n", + " shutil.rmtree(UTILS_DIR)\n", "shutil.copytree('../../reco_utils/', UTILS_DIR)" ] }, @@ -491,7 +479,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 39, "metadata": { "tags": [ "configure estimator" @@ -526,7 +514,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -551,13 +539,13 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5119a29a82044100be0e2f47e40aef15", + "model_id": "145080f4fc0e47c8a892ff7db3f3c08b", "version_major": 2, "version_minor": 0 }, @@ -583,9 +571,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
<table><tr><th>Experiment</th><th>Id</th><th>Type</th><th>Status</th><th>Details Page</th><th>Docs Page</th></tr><tr><td>movielens-sar</td><td>movielens-sar_1575027796_199dd2c6</td><td>azureml.scriptrun</td><td>Completed</td><td>Link to Azure Portal</td><td>Link to Documentation</td></tr></table>
" + ], + "text/plain": [ + "Run(Experiment: movielens-sar,\n", + "Id: movielens-sar_1575027796_199dd2c6,\n", + "Type: azureml.scriptrun,\n", + "Status: Completed)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "run" ] @@ -602,21 +607,31 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 43, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'top-k': 10, 'data-size': '1m', 'Training time': 2.6481945514678955, 'Prediction time': 2.9131650924682617, 'map': 0.0023399652870897034, 'ndcg': 0.031352549193757774, 'precision': 0.038807947019867554, 'recall': 0.014086226527787116}\n" - ] + "data": { + "text/plain": [ + "{'top-k': 10,\n", + " 'data-size': '100k',\n", + " 'Training time': 0.4077951559999633,\n", + " 'Prediction time': 0.13354294300000902,\n", + " 'map': 0.11059057578638949,\n", + " 'ndcg': 0.3824612290501957,\n", + " 'precision': 0.33075291622481445,\n", + " 'recall': 0.1763854474342893}" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "# run below after run is complete, otherwise metrics is empty\n", "metrics = run.get_metrics()\n", - "print(metrics)" + "metrics" ] }, { @@ -629,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -643,21 +658,28 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "# clean up temporary directory\n", "tmp_dir.cleanup()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python 3.6 - AzureML", + "display_name": "Python (reco_base)", "language": "python", - "name": "python3-azureml" + "name": "reco_base" }, "language_info": { "codemirror_mode": { @@ -669,7 +691,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.6.8" } }, "nbformat": 4,