From 2782e16389b2fb373cae543474dd8b406441d014 Mon Sep 17 00:00:00 2001 From: djarpin Date: Thu, 9 Nov 2017 14:05:26 -0800 Subject: [PATCH] Added: PCA k-means movie clustering example notebook --- .../AmazonAIAlgorithmsIO_pb2.py | 101 ++ pca_kmeans_movie_clustering/convert_data.py | 90 ++ .../pca_kmeans_movie_clustering.ipynb | 896 ++++++++++++++++++ pca_kmeans_movie_clustering/record_pb2.py | 501 ++++++++++ 4 files changed, 1588 insertions(+) create mode 100644 pca_kmeans_movie_clustering/AmazonAIAlgorithmsIO_pb2.py create mode 100644 pca_kmeans_movie_clustering/convert_data.py create mode 100644 pca_kmeans_movie_clustering/pca_kmeans_movie_clustering.ipynb create mode 100644 pca_kmeans_movie_clustering/record_pb2.py diff --git a/pca_kmeans_movie_clustering/AmazonAIAlgorithmsIO_pb2.py b/pca_kmeans_movie_clustering/AmazonAIAlgorithmsIO_pb2.py new file mode 100644 index 0000000000..ca6db7a35f --- /dev/null +++ b/pca_kmeans_movie_clustering/AmazonAIAlgorithmsIO_pb2.py @@ -0,0 +1,101 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: AmazonAIAlgorithmsIO.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='AmazonAIAlgorithmsIO.proto', + package='AmazonAIAlgorithmsIO', + syntax='proto2', + serialized_pb=_b('\n\x1a\x41mazonAIAlgorithmsIO.proto\x12\x14\x41mazonAIAlgorithmsIO\"\\\n\x06Record\x12\x10\n\x04keys\x18\x01 \x03(\x04\x42\x02\x10\x01\x12\x12\n\x06values\x18\x02 \x03(\x02\x42\x02\x10\x01\x12\r\n\x05label\x18\x03 \x01(\x01\x12\x0b\n\x03uid\x18\x04 \x01(\t\x12\x10\n\x08metadata\x18\x05 \x01(\t') +) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + + + +_RECORD = _descriptor.Descriptor( + name='Record', + full_name='AmazonAIAlgorithmsIO.Record', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='keys', full_name='AmazonAIAlgorithmsIO.Record.keys', index=0, + number=1, type=4, cpp_type=4, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), + _descriptor.FieldDescriptor( + name='values', full_name='AmazonAIAlgorithmsIO.Record.values', index=1, + number=2, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), + _descriptor.FieldDescriptor( + name='label', full_name='AmazonAIAlgorithmsIO.Record.label', index=2, + number=3, type=1, cpp_type=5, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='uid', full_name='AmazonAIAlgorithmsIO.Record.uid', index=3, + number=4, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + 
is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='metadata', full_name='AmazonAIAlgorithmsIO.Record.metadata', index=4, + number=5, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=52, + serialized_end=144, +) + +DESCRIPTOR.message_types_by_name['Record'] = _RECORD + +Record = _reflection.GeneratedProtocolMessageType('Record', (_message.Message,), dict( + DESCRIPTOR = _RECORD, + __module__ = 'AmazonAIAlgorithmsIO_pb2' + # @@protoc_insertion_point(class_scope:AmazonAIAlgorithmsIO.Record) + )) +_sym_db.RegisterMessage(Record) + + +_RECORD.fields_by_name['keys'].has_options = True +_RECORD.fields_by_name['keys']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) +_RECORD.fields_by_name['values'].has_options = True +_RECORD.fields_by_name['values']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) +# @@protoc_insertion_point(module_scope) diff --git a/pca_kmeans_movie_clustering/convert_data.py b/pca_kmeans_movie_clustering/convert_data.py new file mode 100644 index 0000000000..419c440e51 --- /dev/null +++ b/pca_kmeans_movie_clustering/convert_data.py @@ -0,0 +1,90 @@ +import struct +import io +import boto3 +import sys + +import AmazonAIAlgorithmsIO_pb2 +from record_pb2 import Record + + +def write_recordio(f, data): + kmagic = 0xced7230a + length = len(data) + f.write(struct.pack('I', kmagic)) + f.write(struct.pack('I', length)) + upper_align = ((length + 3) >> 2) << 2 + padding = bytes([0x00 for _ in range(upper_align - length)]) + f.write(data) + f.write(padding) + + +def list_to_record_bytes(values, keys=None, label=None, feature_size=None): + record = Record() + + record.features['values'].float32_tensor.values.extend(values) + + if keys is not None: + if feature_size is None: + raise ValueError("For sparse tensors the feature size must be specified.") + + record.features['values'].float32_tensor.keys.extend(keys) + + if feature_size is not None: + record.features['values'].float32_tensor.shape.extend([feature_size]) + + if label is not None: + record.label['values'].float32_tensor.values.extend([label]) + + return record.SerializeToString() + + +def read_next(f): + kmagic = 0xced7230a + raw_bytes = f.read(4) + if not raw_bytes: + return + m = struct.unpack('I', raw_bytes)[0] + if m != kmagic: + raise ValueError("Incorrect encoding") + length = struct.unpack('I', f.read(4))[0] + upper_align = ((length + 3) >> 2) << 2 + data = f.read(upper_align) + return data[:length] + + +def to_proto(f, labels, vectors): + for label, vec in zip(labels, vectors): + record = AmazonAIAlgorithmsIO_pb2.Record() + record.values.extend(vec) + record.label = label + write_recordio(f, record.SerializeToString()) + + +def to_libsvm(f, labels, values): + f.write('\n'.join( + ['{} {}'.format(label, ' '.join(['{}:{}'.format(i + 1, el) for i, el in enumerate(vec)])) for label, vec in + zip(labels, values)])) + return f + + +def write_to_s3(fobj, bucket, key): + return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(fobj) + + +def upload_to_s3(partition_name, partition, bucket): + labels = [t.tolist() for t in partition[1]] + vectors = 
[t.tolist() for t in partition[0]] + f = io.BytesIO() + to_proto(f, labels, vectors) + f.seek(0) + key = "{}/examples".format(partition_name) + url = 's3n://{}/{}'.format(bucket, key) + print('Writing to {}'.format(url)) + write_to_s3(f, bucket, key) + print('Done writing to {}'.format(url)) + + +def convert_data(partitions, bucket): + for partition_name, partition in partitions: + print('{}: {} {}'.format(partition_name, partition[0].shape, partition[1].shape)) + upload_to_s3(partition_name, partition, bucket) diff --git a/pca_kmeans_movie_clustering/pca_kmeans_movie_clustering.ipynb b/pca_kmeans_movie_clustering/pca_kmeans_movie_clustering.ipynb new file mode 100644 index 0000000000..8f1b9124cc --- /dev/null +++ b/pca_kmeans_movie_clustering/pca_kmeans_movie_clustering.ipynb @@ -0,0 +1,896 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# IMDb Movie Segments\n", + "**Using principal components and k-means to find clusters of similar movies**\n", + "\n", + "# Contents\n", + "\n", + "1. [Background](#Background)\n", + "1. [Setup](#Setup)\n", + "1. [Data](#Data)\n", + " 1. [Import and Unzip](##Import and Unzip)\n", + " 1. [Transform](##Transform)\n", + " 1. [Visualize](###Visualize)\n", + " 1. [Upload](##Upload)\n", + "1. [Train PCA](#Train PCA)\n", + "1. [Host PCA](#Host PCA)\n", + " 1. [Score PCA](##Score PCA)\n", + " 1. [Visualize Components](#Visualize Components)\n", + "1. [Train k-means](#Train k-means)\n", + "1. [Host k-means](#Host k-means)\n", + " 1. [Score k-means](##Score k-means)\n", + "\n", + "# Background\n", + "\n", + "Clustering is a common unsupervised machine learning task, used in contexts from marketing to recommender systems. However, clustering does have difficulty in very high dimensional spaces, where all observations in the data start to look dissimilar because they randomly happen to differ on some (potentially irrelevant) feature.\n", + "\n", + "To correct for this, dimensionality reduction techniques are often used to bring the data into a lower dimensional space, reducing redundant variance, and allowing for better clustering solutions.\n", + "\n", + "In this notebook, we walk through an example which starts with IMDb movie data on genre, ratings, age, etc. and utilizes Principal Component Analysis (PCA) for dimensionality reduction, and k-means for clustering within that reduced dimensional space.\n", + "\n", + "---\n", + "# Setup\n", + "\n", + "Here we specify the linkage and authentication to AWS services. There are three parts to this:\n", + "\n", + "* The credentials and region for the account that's running training. Upload the credentials in the normal AWS credentials file format to '~/.aws/' or run 'aws configure' from a Jupyter terminal. The region must always be `us-west-2` during the Beta program.\n", + "* The roles used to give learning and hosting access to your data. See the documentation for how to specify these.\n", + "* The S3 bucket that you want to use for training and model data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "isConfigCell": true + }, + "outputs": [], + "source": [ + "import os\n", + "import boto3\n", + "\n", + "os.environ['AWS_DEFAULT_REGION'] = 'us-west-2'\n", + "role = boto3.client('iam').list_instance_profiles()['InstanceProfiles'][0]['Roles'][0]['Arn']\n", + "\n", + "bucket = ''\n", + "pca_prefix = 'pca_kmeans_movie_clustering/pca'\n", + "kmeans_prefix = 'pca_kmeans_movie_clustering/kmeans'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's also bring in the Python libraries we'll want to use for this exercise." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import sys\n", + "import convert_data\n", + "import boto3\n", + "import time\n", + "import json\n", + "import io\n", + "import matplotlib.pyplot as plt\n", + "from IPython.display import display" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Data\n", + "\n", + "For this notebook, we'll be using the IMDb dataset, which is openly available on S3. It contains a great deal of information, but to keep this straightforward, let's limit ourselves to basic details and user ratings for movies.\n", + "\n", + "## Import and Unzip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "!aws s3api get-object --request-payer requester --bucket imdb-datasets --key documents/v1/current/title.basics.tsv.gz ./title.basics.tsv.gz\n", + "!aws s3api get-object --request-payer requester --bucket imdb-datasets --key documents/v1/current/title.ratings.tsv.gz ./title.ratings.tsv.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "!gunzip -f title.basics.tsv.gz\n", + "!gunzip -f title.ratings.tsv.gz" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transform\n", + "\n", + "Let's filter down to just movies and remove those with incomplete or irrelevant data or a small number of reviews." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "basics = pd.read_csv('title.basics.tsv', sep='\\t')\n", + "movies = basics[(basics['titleType'] == 'movie') & \\\n", + " (basics['isAdult'] == 0) & \\\n", + " (basics['startYear'] != '\\\\N') & \\\n", + " (basics['runtimeMinutes'] != '\\\\N')]\n", + "ratings = pd.read_csv('title.ratings.tsv', sep='\\t')\n", + "movies = movies.merge(ratings[ratings['numVotes'] >= 100], on='tconst')\n", + "movies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are text columns which need to be converted to a numeric representation in order to use them in our machine learning models. In this case, that text information is genre. We'd like to convert this single column into a set of columns which are 1 if the movie is a member of that genre and 0 otherwise. Since a movie can be in multiple genres, the pre-processing below is necessary.\n",
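+    "\n",
+    "For intuition, here's roughly the shape we're aiming for (a sketch with made-up rows; pandas' built-in `str.get_dummies` does the same splitting for a single Series, while the helpers below additionally merge the indicator columns back onto the full movies table):\n",
+    "\n",
+    "```python\n",
+    "toy = pd.DataFrame({'genres': ['Comedy,Drama', 'Comedy', 'Horror']})\n",
+    "# One indicator column per genre; a movie can be 1 in several columns.\n",
+    "toy['genres'].str.get_dummies(sep=',')\n",
+    "#    Comedy  Drama  Horror\n",
+    "# 0       1      1       0\n",
+    "# 1       1      0       0\n",
+    "# 2       0      0       1\n",
+    "```"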
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def split_indicators(df, col):\n", + " keys = df[col].unique()\n", + " split_keys = pd.concat([pd.DataFrame(keys), pd.Series(keys).apply(lambda x: pd.Series([i for i in x.split(',')]))], axis=1)\n", + " split_keys.columns = [col] + ['x.{}'.format(i) for i in range(1, len(split_keys.columns))]\n", + " key_list = split_keys.melt(id_vars=col)\n", + " key_list['dummy'] = 1\n", + " return key_list.pivot_table(index=col, columns='value', values='dummy').fillna(0)\n", + "\n", + "def add_indicators(df, col):\n", + " indicators = split_indicators(df, col)\n", + " indicators[col] = indicators.index\n", + " return df.merge(indicators, on=col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "movies = add_indicators(movies, 'genres')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's:\n", + "1. Convert all columns to numbers\n", + "1. Drop columns that we won't use as features for training our machine learning algorithms\n", + "1. Standardize (give each column a mean of 0 and a standard deviation of 1, since columns like startYear are on a completely different scale than averageRating)\n", + "1. Convert to a numpy matrix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "movies['startYear'] = pd.to_numeric(movies['startYear'])\n", + "movies['runtimeMinutes'] = pd.to_numeric(movies['runtimeMinutes'])\n", + "train_data = movies.drop(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'endYear', 'genres', '\\\\N'], axis=1)\n", + "train_data = (train_data - train_data.mean()) / train_data.std()\n", + "train_data = train_data.as_matrix().astype(float)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Visualize\n", + "\n", + "The best case scenario in clustering is that the machine learning model is more of a formality, with the clusters already being visibly apparent. However, the higher the dimensionality of the space, the more difficult that becomes. Let's look at scatterplots for just the first few columns in our training data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.plotting.scatter_matrix(pd.DataFrame(train_data).iloc[:, 0:5], figsize=(12, 12))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see, the data are often continuously distributed, with occasional outliers, but show minimal other distinction that we can use to visibly separate them into clusters." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upload\n", + "\n", + "Let's upload the data to S3 so that we can train our model in EASE. Notice we are using the convert_data functions, which write our in-memory datasets to a recordIO protobuf format for improved performance.\n",
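+    "\n",
+    "If you're curious what those helpers actually produce, here's a minimal round-trip sketch (it only assumes the convert_data.py and record_pb2.py files included alongside this notebook): each record is a protobuf `Record` message framed by a 4-byte magic number, a 4-byte length, and zero-padding to a 4-byte boundary.\n",
+    "\n",
+    "```python\n",
+    "from record_pb2 import Record\n",
+    "\n",
+    "buf = io.BytesIO()\n",
+    "# Serialize one dense 3-feature example and wrap it in recordIO framing.\n",
+    "convert_data.write_recordio(buf, convert_data.list_to_record_bytes([1.0, 2.0, 3.0], label=0, feature_size=3))\n",
+    "buf.seek(0)\n",
+    "\n",
+    "# Strip the framing back off and parse the payload into a Record message.\n",
+    "rec = Record()\n",
+    "rec.ParseFromString(convert_data.read_next(buf))\n",
+    "print(rec.features['values'].float32_tensor.values)  # [1.0, 2.0, 3.0]\n",
+    "```"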
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "train_file = 'pca_train.data'\n", + "\n", + "f = io.BytesIO()\n", + "for row in train_data:\n", + " convert_data.write_recordio(f, convert_data.list_to_record_bytes(row, label=0, feature_size=31))\n", + "f.seek(0)\n", + "\n", + "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(pca_prefix, 'train', train_file)).upload_fileobj(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Train PCA\n", + "\n", + "PCA is an unsupervised dimensionality reduction technique that finds a set of orthogonal components (linear combinations of the original features), ordered by the amount of variance in the data they explain. Keeping only the first few components gives us a compact representation of each movie that preserves most of the variation in the original 31 features.\n", + "\n", + "Let's start by specifying the training parameters needed for the IM API, including:\n", + "1. The role to use\n", + "1. Our training job name\n", + "1. The PCA algorithm container\n", + "1. Training instance type and count\n", + "1. S3 location for training data\n", + "1. S3 location for output data\n", + "1. Algorithm hyperparameters\n", + "1. Stopping conditions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pca_job = 'pca-poc-' + time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())\n", + "\n", + "print(\"Job name is:\", pca_job)\n", + "\n", + "pca_training_params = {\n", + " \"RoleArn\": role,\n", + " \"TrainingJobName\": pca_job,\n", + " \"AlgorithmSpecification\": {\n", + " \"TrainingImage\": \"900597767885.dkr.ecr.us-east-1.amazonaws.com/ease-pca:latest\",\n", + " \"TrainingInputMode\": \"File\"\n", + " },\n", + " \"ResourceConfig\": {\n", + " \"InstanceCount\": 2,\n", + " \"InstanceType\": \"c4.8xlarge\",\n", + " \"VolumeSizeInGB\": 50\n", + " },\n", + " \"InputDataConfig\": [\n", + " {\n", + " \"ChannelName\": \"train\",\n", + " \"DataSource\": {\n", + " \"S3DataSource\": {\n", + " \"S3DataType\": \"S3Prefix\",\n", + " \"S3Uri\": \"s3://{}/{}/train/\".format(bucket, pca_prefix),\n", + " \"S3DataDistributionType\": \"FullyReplicated\"\n", + " }\n", + " },\n", + " \"CompressionType\": \"None\",\n", + " \"RecordWrapperType\": \"None\"\n", + " }\n", + " ],\n", + " \"OutputDataConfig\": {\n", + " \"S3OutputPath\": \"s3://{}/{}/\".format(bucket, pca_prefix)\n", + " },\n", + " \"HyperParameters\": {\n", + " 'algorithm_mode': 'randomized',\n", + " 'num_components': '5',\n", + " 'subtract_mean': 'True',\n", + " 'extra_components': '-1',\n", + " 'feature_dim': '31',\n", + " 'mini_batch_size': '5000'\n", + " },\n", + " \"StoppingCondition\": {\n", + " \"MaxRuntimeInHours\": 1\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's kick off our training job on EASE, using the parameters we just created. Because training is serverless, we don't have to wait for our job to finish to continue, but in this case let's wait on the job and check its final status, so we know the PCA model is ready before moving on."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "im = boto3.client('im')\n", + "im.create_training_job(**pca_training_params)\n", + "\n", + "status = im.describe_training_job(TrainingJobName=pca_job)['TrainingJobStatus']\n", + "print(status)\n", + "im.get_waiter('TrainingJob_Created').wait(TrainingJobName=pca_job)\n", + "if status == 'Failed':\n", + " message = im.describe_training_job(TrainingJobName=pca_job)['FailureReason']\n", + " print('Training failed with the following error: {}'.format(message))\n", + " raise Exception('Training job failed')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Host PCA\n", + "\n", + "Now that we've trained the PCA algorithm on our data, let's setup a model which can later be hosted. We will:\n", + "1. Point to the scoring container\n", + "1. Point to the model.tar.gz that came from training\n", + "1. Create the hosting model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pca_hosting_container = {\n", + " 'Image': \"900597767885.dkr.ecr.us-east-1.amazonaws.com/ease-pca:latest\",\n", + " 'ModelDataUrl': im.describe_training_job(TrainingJobName=pca_job)['ModelArtifacts']['S3ModelArtifacts']\n", + "}\n", + "\n", + "create_model_response = im.create_model(\n", + " ModelName=pca_job,\n", + " ExecutionRoleArn=role,\n", + " PrimaryContainer=pca_hosting_container)\n", + "\n", + "print(create_model_response['ModelArn'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we've setup a model, we can configure what our hosting endpoints should be. Here we specify:\n", + "1. EC2 instance type to use for hosting\n", + "1. Lower and upper bounds for number of instances\n", + "1. Our hosting model name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pca_endpoint_config = 'pca-poc-endpoint-config-' + time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())\n", + "print(pca_endpoint_config)\n", + "create_endpoint_config_response = im.create_endpoint_config(\n", + " EndpointConfigName=pca_endpoint_config,\n", + " ProductionVariants=[{\n", + " 'InstanceType': 'c4.xlarge',\n", + " 'MaxInstanceCount': 3,\n", + " 'MinInstanceCount': 1,\n", + " 'ModelName': pca_job,\n", + " 'VariantName': 'AllTraffic'}])\n", + "\n", + "print(\"Endpoint Config Arn: \" + create_endpoint_config_response['EndpointConfigArn'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we've specified how our endpoint should be configured, we can create them. This can be done in the background, but for now let's run a loop that updates us on the status of the endpoints so that we know when they are ready for use." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "pca_endpoint = 'pca-poc-endpoint-' + time.strftime(\"%Y%m%d%H%M\", time.gmtime())\n", + "print(pca_endpoint)\n", + "create_endpoint_response = im.create_endpoint(\n", + " EndpointName=pca_endpoint,\n", + " EndpointConfigName=pca_endpoint_config)\n", + "print(create_endpoint_response['EndpointArn'])\n", + "\n", + "resp = im.describe_endpoint(EndpointName=pca_endpoint)\n", + "status = resp['EndpointStatus']\n", + "print(\"Status: \" + status)\n", + "\n", + "im.get_waiter('Endpoint_Created').wait(EndpointName=pca_endpoint)\n", + "\n", + "resp = im.describe_endpoint(EndpointName=pca_endpoint)\n", + "status = resp['EndpointStatus']\n", + "print(\"Arn: \" + resp['EndpointArn'])\n", + "print(\"Status: \" + status)\n", + "\n", + "if status != 'InService':\n", + " raise Exception('Endpoint creation did not succeed')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Score PCA\n", + "\n", + "Now that our endpoint is live, we can generate predictions from it. In this case, we'll use it to score our training data, which results in the reduced-dimension components." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def np2csv(arr):\n", + " csv = io.BytesIO()\n", + " np.savetxt(csv, arr, delimiter=',', fmt='%g')\n", + " return csv.getvalue().decode().rstrip()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "runtime = boto3.Session().client(service_name='runtime.maeve', endpoint_url='https://maeveruntime.prod.us-west-2.ml-platform.aws.a2z.com')\n", + "\n", + "minibatch_rows = 5000000. / sys.getsizeof(np2csv(train_data[0]))\n", + "split_array = np.array_split(train_data, int(train_data.shape[0] / float(minibatch_rows) + 1))\n", + "components = []\n", + "for array in split_array:\n", + " payload = np2csv(array)\n", + " response = runtime.invoke_endpoint(EndpointName=pca_endpoint,\n", + " ContentType='text/csv',\n", + " Body=payload)\n", + " result = json.loads(response['Body'].read().decode())\n", + " components += [p['projection'] for p in result['projections']]\n", + "\n", + "components = np.array(components)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Visualize Components\n", + "\n", + "As mentioned above, ideally the clusters would already be visibly apparent in our data. Now that we've used PCA to reduce the dimensionality, let's look at some scatterplots to see whether we can easily make out any groups of movies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.plotting.scatter_matrix(pd.DataFrame(components), figsize=(12, 12))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The scatterplots tend to be dominated by one large mass of data points, but there are also several other sizable groups which are noticeably distinct. We can utilize k-means to find these groupings in a more robust manner." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Train k-means\n", + "\n", + "Next, let's run k-means on our reduced dimensional output. Start by outputting the data to S3. Notice we'll use the same bucket, but a different S3 prefix to avoid supplying conflicting training data.\n",
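+    "\n",
+    "We'll ask for 8 clusters below, which is a somewhat arbitrary starting point. If you'd like to sanity-check that choice, a quick elbow-style sweep over the 5 PCA components works well; this sketch assumes scikit-learn is installed on the notebook instance and runs locally, independent of the EASE training job:\n",
+    "\n",
+    "```python\n",
+    "from sklearn.cluster import MiniBatchKMeans\n",
+    "\n",
+    "# Within-cluster sum of squares for a range of k; look for the 'elbow' where it flattens out.\n",
+    "for k in range(2, 13):\n",
+    "    inertia = MiniBatchKMeans(n_clusters=k, random_state=0).fit(components).inertia_\n",
+    "    print(k, round(inertia))\n",
+    "```"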
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# TODO update to newer protobuf format\n", + "train_file = 'kmeans_train.data'\n", + "\n", + "vectors = [t.tolist() for t in components]\n", + "labels = [t.tolist() for t in components[:, 0]]\n", + "\n", + "f = io.BytesIO()\n", + "convert_data.to_proto(f, labels=labels, vectors=vectors)\n", + "f.seek(0)\n", + "\n", + "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(kmeans_prefix, 'train', train_file)).upload_fileobj(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we'll setup our k-means training parameters. This is essentially the same as our definition for pca_training_params except we've changed:\n", + "1. The container image to k-means\n", + "1. S3 output path\n", + "1. Algorithm hyperparameters (notice our feature dimension is now 5 as we're clustering the 5 components output by PCA)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO update to newer container... Hosting already is\n", + "kmeans_job = 'kmeans-poc-' + time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())\n", + "\n", + "print(\"Job name is:\", kmeans_job)\n", + "\n", + "kmeans_training_params = {\n", + " \"RoleArn\": role,\n", + " \"TrainingJobName\": kmeans_job,\n", + " \"AlgorithmSpecification\": {\n", + " \"TrainingImage\": \"900597767885.dkr.ecr.us-east-1.amazonaws.com/kmeanswebscale:latest\",\n", + " \"TrainingInputMode\": \"File\"\n", + " },\n", + " \"ResourceConfig\": {\n", + " \"InstanceCount\": 2,\n", + " \"InstanceType\": \"c4.8xlarge\",\n", + " \"VolumeSizeInGB\": 50\n", + " },\n", + " \"InputDataConfig\": [\n", + " {\n", + " \"ChannelName\": \"train\",\n", + " \"DataSource\": {\n", + " \"S3DataSource\": {\n", + " \"S3DataType\": \"S3Prefix\",\n", + " \"S3Uri\": \"s3://{}/{}/train\".format(bucket, kmeans_prefix),\n", + " \"S3DataDistributionType\": \"FullyReplicated\"\n", + " }\n", + " },\n", + " \"CompressionType\": \"None\",\n", + " \"RecordWrapperType\": \"None\"\n", + " }\n", + " ],\n", + " \"OutputDataConfig\": {\n", + " \"S3OutputPath\": \"s3://{}/{}/\".format(bucket, kmeans_prefix)\n", + " },\n", + " \"HyperParameters\": {\n", + " \"num_clusters\": \"8\",\n", + " \"feature_dim\": \"5\",\n", + " \"mini_batch_size\": \"5000\",\n", + " \"init_method\": \"random\",\n", + " \"epochs\": \"1\"\n", + " },\n", + " \"StoppingCondition\": {\n", + " \"MaxRuntimeInHours\": 1\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now invoke EASE for serverless training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "im = boto3.client('im')\n", + "im.create_training_job(**kmeans_training_params)\n", + "\n", + "status = im.describe_training_job(TrainingJobName=kmeans_job)['TrainingJobStatus']\n", + "print(status)\n", + "im.get_waiter('TrainingJob_Created').wait(TrainingJobName=kmeans_job)\n", + "if status == 'Failed':\n", + " message = im.describe_training_job(TrainingJobName=kmeans_job)['FailureReason']\n", + " print('Training failed with the following error: {}'.format(message))\n", + " raise Exception('Training job failed')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Host k-means\n", + "\n", + "Define our model for hosting." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kmeans_hosting_container = {\n", + " 'Image': \"900597767885.dkr.ecr.us-east-1.amazonaws.com/aialgorithmskmeanswebscalecontainer:latest\",\n", + " 'ModelDataUrl': im.describe_training_job(TrainingJobName=kmeans_job)['ModelArtifacts']['S3ModelArtifacts']\n", + "}\n", + "\n", + "create_model_response = im.create_model(\n", + " ModelName=kmeans_job,\n", + " ExecutionRoleArn=role,\n", + " PrimaryContainer=kmeans_hosting_container)\n", + "\n", + "print(create_model_response['ModelArn'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Setup our endpoint configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kmeans_endpoint_config = 'kmeans-poc-endpoint-config-' + time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())\n", + "print(kmeans_endpoint_config)\n", + "create_endpoint_config_response = im.create_endpoint_config(\n", + " EndpointConfigName=kmeans_endpoint_config,\n", + " ProductionVariants=[{\n", + " 'InstanceType': 'c4.xlarge',\n", + " 'MaxInstanceCount': 3,\n", + " 'MinInstanceCount': 1,\n", + " 'ModelName': kmeans_job,\n", + " 'VariantName': 'AllTraffic'}])\n", + "\n", + "print(\"Endpoint Config Arn: \" + create_endpoint_config_response['EndpointConfigArn'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initiate our endpoints." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "kmeans_endpoint = 'kmeans-poc-endpoint-' + time.strftime(\"%Y%m%d%H%M\", time.gmtime())\n", + "print(kmeans_endpoint)\n", + "create_endpoint_response = im.create_endpoint(\n", + " EndpointName=kmeans_endpoint,\n", + " EndpointConfigName=kmeans_endpoint_config)\n", + "print(create_endpoint_response['EndpointArn'])\n", + "\n", + "resp = im.describe_endpoint(EndpointName=kmeans_endpoint)\n", + "status = resp['EndpointStatus']\n", + "print(\"Status: \" + status)\n", + "\n", + "im.get_waiter('Endpoint_Created').wait(EndpointName=kmeans_endpoint)\n", + "\n", + "resp = im.describe_endpoint(EndpointName=kmeans_endpoint)\n", + "status = resp['EndpointStatus']\n", + "print(\"Arn: \" + resp['EndpointArn'])\n", + "print(\"Status: \" + status)\n", + "\n", + "if status != 'InService':\n", + " raise Exception('Endpoint creation did not succeed')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Score k-means\n", + "\n", + "Now that our endpoint is live, we can generate predictions from it. In this case, we'll use it to score our training data, which results in the assigned cluster for each movie." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "runtime = boto3.Session().client(service_name='runtime.maeve', endpoint_url='https://maeveruntime.prod.us-west-2.ml-platform.aws.a2z.com')\n", + "\n", + "minibatch_rows = 5000000. 
/ sys.getsizeof(np2csv(components[0]))\n", + "split_array = np.array_split(components, int(components.shape[0] / float(minibatch_rows) + 1))\n", + "clusters = []\n", + "for array in split_array:\n", + " payload = np2csv(array)\n", + " response = runtime.invoke_endpoint(EndpointName=kmeans_endpoint,\n", + " ContentType='text/csv',\n", + " Body=payload)\n", + " result = json.loads(response['Body'].read().decode())\n", + " clusters += [r['closest_cluster'] for r in result['predictions']]\n", + "\n", + "\n", + "movies['cluster'] = clusters\n", + "movies['cluster'] = movies['cluster'].astype(object)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a quick look at how the clusters differ from one another.\n", + "\n", + "_Note that because of random initialization of cluster centroids, results may vary slightly across runs._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.crosstab(index=movies['cluster'], columns='% observations', normalize='columns')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Most movies belong to one of two large clusters, but there aren't any unreasonably small clusters, which is a good sign. Let's look at how their distributions differ." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "for column in ['startYear', 'averageRating', 'numVotes']: #movies.select_dtypes(exclude=['object']).columns:\n", + " print(column)\n", + " hist = movies[[column, 'cluster']].hist(by='cluster', bins=30, figsize=(12, 3), layout=(1, 8))\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see:\n", + "- Clusters 2, 5, and 6 (in particular) skew toward substantially earlier release dates.\n", + "- Clusters 0, 3, and 5 have wider ratings distributions.\n", + "- Cluster 7 appears to skew toward the most popular movies, with a larger portion having a very high number of reviews.\n", + "\n", + "Now let's get recent examples from each cluster." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "for group in movies[(movies['startYear'] > 2000) & (movies['numVotes'] > 1000)].groupby('cluster'):\n", + " print('Cluster:', np.max(group[1]['cluster']))\n", + " display(group[1].sample(n=10, replace=True, random_state=0))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see:\n", + "- Cluster 0 includes a wide variety of movies, some foreign, some low budget, most with mediocre ratings.\n", + "- Cluster 1 has many Romance and Drama films.\n", + "- Cluster 2 tends to be Crime and Thriller movies.\n", + "- Cluster 3 is largely low-rated Horror.\n", + "- Cluster 4 is largely Biographies.\n", + "- Cluster 5 is another broad category, with some foreign films and others with mediocre ratings.\n", + "- Cluster 6 appears to be mostly Musicals.\n", + "- Cluster 7 is mostly Hollywood blockbusters with broad appeal.\n", + "\n", + "Although many of the clusters are defined by genre, some, like cluster 2, span multiple related genres and do not require membership in all of them for a movie to be included.\n",
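+    "\n",
+    "If you'd like a quantitative read on how well separated these clusters are, rather than eyeballing histograms and samples, a sampled silhouette score over the PCA components is a quick check. This is an optional sketch that assumes scikit-learn is installed on the notebook instance; values closer to 1 indicate better-separated clusters.\n",
+    "\n",
+    "```python\n",
+    "from sklearn.metrics import silhouette_score\n",
+    "\n",
+    "# Score on a 10k-movie sample to keep the pairwise-distance computation cheap.\n",
+    "print(silhouette_score(components, np.array(clusters), sample_size=10000, random_state=0))\n",
+    "```"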
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pca_kmeans_movie_clustering/record_pb2.py b/pca_kmeans_movie_clustering/record_pb2.py new file mode 100644 index 0000000000..e49d21d030 --- /dev/null +++ b/pca_kmeans_movie_clustering/record_pb2.py @@ -0,0 +1,501 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: src/ai_algorithms_protobuf_python/record.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='src/ai_algorithms_protobuf_python/record.proto', + package='aialgs.data', + syntax='proto2', + serialized_pb=_b('\n.src/ai_algorithms_protobuf_python/record.proto\x12\x0b\x61ialgs.data\"H\n\rFloat32Tensor\x12\x12\n\x06values\x18\x01 \x03(\x02\x42\x02\x10\x01\x12\x10\n\x04keys\x18\x02 \x03(\x04\x42\x02\x10\x01\x12\x11\n\x05shape\x18\x03 \x03(\x04\x42\x02\x10\x01\"H\n\rFloat64Tensor\x12\x12\n\x06values\x18\x01 \x03(\x01\x42\x02\x10\x01\x12\x10\n\x04keys\x18\x02 \x03(\x04\x42\x02\x10\x01\x12\x11\n\x05shape\x18\x03 \x03(\x04\x42\x02\x10\x01\"F\n\x0bInt32Tensor\x12\x12\n\x06values\x18\x01 \x03(\x05\x42\x02\x10\x01\x12\x10\n\x04keys\x18\x02 \x03(\x04\x42\x02\x10\x01\x12\x11\n\x05shape\x18\x03 \x03(\x04\x42\x02\x10\x01\",\n\x05\x42ytes\x12\r\n\x05value\x18\x01 \x03(\x0c\x12\x14\n\x0c\x63ontent_type\x18\x02 \x01(\t\"\xd3\x01\n\x05Value\x12\x34\n\x0e\x66loat32_tensor\x18\x02 \x01(\x0b\x32\x1a.aialgs.data.Float32TensorH\x00\x12\x34\n\x0e\x66loat64_tensor\x18\x03 \x01(\x0b\x32\x1a.aialgs.data.Float64TensorH\x00\x12\x30\n\x0cint32_tensor\x18\x07 \x01(\x0b\x32\x18.aialgs.data.Int32TensorH\x00\x12#\n\x05\x62ytes\x18\t \x01(\x0b\x32\x12.aialgs.data.BytesH\x00\x42\x07\n\x05value\"\xa9\x02\n\x06Record\x12\x33\n\x08\x66\x65\x61tures\x18\x01 \x03(\x0b\x32!.aialgs.data.Record.FeaturesEntry\x12-\n\x05label\x18\x02 \x03(\x0b\x32\x1e.aialgs.data.Record.LabelEntry\x12\x0b\n\x03uid\x18\x03 \x01(\t\x12\x10\n\x08metadata\x18\x04 \x01(\t\x12\x15\n\rconfiguration\x18\x05 \x01(\t\x1a\x43\n\rFeaturesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12!\n\x05value\x18\x02 \x01(\x0b\x32\x12.aialgs.data.Value:\x02\x38\x01\x1a@\n\nLabelEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12!\n\x05value\x18\x02 \x01(\x0b\x32\x12.aialgs.data.Value:\x02\x38\x01\x42\x30\n com.amazonaws.aialgorithms.protoB\x0cRecordProtos') +) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + + + +_FLOAT32TENSOR = _descriptor.Descriptor( + name='Float32Tensor', + full_name='aialgs.data.Float32Tensor', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='values', full_name='aialgs.data.Float32Tensor.values', index=0, + number=1, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + 
is_extension=False, extension_scope=None, + options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), + _descriptor.FieldDescriptor( + name='keys', full_name='aialgs.data.Float32Tensor.keys', index=1, + number=2, type=4, cpp_type=4, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), + _descriptor.FieldDescriptor( + name='shape', full_name='aialgs.data.Float32Tensor.shape', index=2, + number=3, type=4, cpp_type=4, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=63, + serialized_end=135, +) + + +_FLOAT64TENSOR = _descriptor.Descriptor( + name='Float64Tensor', + full_name='aialgs.data.Float64Tensor', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='values', full_name='aialgs.data.Float64Tensor.values', index=0, + number=1, type=1, cpp_type=5, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), + _descriptor.FieldDescriptor( + name='keys', full_name='aialgs.data.Float64Tensor.keys', index=1, + number=2, type=4, cpp_type=4, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), + _descriptor.FieldDescriptor( + name='shape', full_name='aialgs.data.Float64Tensor.shape', index=2, + number=3, type=4, cpp_type=4, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=137, + serialized_end=209, +) + + +_INT32TENSOR = _descriptor.Descriptor( + name='Int32Tensor', + full_name='aialgs.data.Int32Tensor', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='values', full_name='aialgs.data.Int32Tensor.values', index=0, + number=1, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), + _descriptor.FieldDescriptor( + name='keys', full_name='aialgs.data.Int32Tensor.keys', index=1, + number=2, type=4, cpp_type=4, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), + _descriptor.FieldDescriptor( 
+ name='shape', full_name='aialgs.data.Int32Tensor.shape', index=2, + number=3, type=4, cpp_type=4, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=_descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=211, + serialized_end=281, +) + + +_BYTES = _descriptor.Descriptor( + name='Bytes', + full_name='aialgs.data.Bytes', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='value', full_name='aialgs.data.Bytes.value', index=0, + number=1, type=12, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='content_type', full_name='aialgs.data.Bytes.content_type', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=283, + serialized_end=327, +) + + +_VALUE = _descriptor.Descriptor( + name='Value', + full_name='aialgs.data.Value', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='float32_tensor', full_name='aialgs.data.Value.float32_tensor', index=0, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='float64_tensor', full_name='aialgs.data.Value.float64_tensor', index=1, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='int32_tensor', full_name='aialgs.data.Value.int32_tensor', index=2, + number=7, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='bytes', full_name='aialgs.data.Value.bytes', index=3, + number=9, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + _descriptor.OneofDescriptor( + name='value', full_name='aialgs.data.Value.value', + index=0, containing_type=None, fields=[]), + ], + serialized_start=330, + serialized_end=541, +) + + +_RECORD_FEATURESENTRY = _descriptor.Descriptor( + name='FeaturesEntry', + full_name='aialgs.data.Record.FeaturesEntry', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='key', 
full_name='aialgs.data.Record.FeaturesEntry.key', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='value', full_name='aialgs.data.Record.FeaturesEntry.value', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=_descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')), + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=708, + serialized_end=775, +) + +_RECORD_LABELENTRY = _descriptor.Descriptor( + name='LabelEntry', + full_name='aialgs.data.Record.LabelEntry', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='key', full_name='aialgs.data.Record.LabelEntry.key', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='value', full_name='aialgs.data.Record.LabelEntry.value', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=_descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')), + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=777, + serialized_end=841, +) + +_RECORD = _descriptor.Descriptor( + name='Record', + full_name='aialgs.data.Record', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='features', full_name='aialgs.data.Record.features', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='label', full_name='aialgs.data.Record.label', index=1, + number=2, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='uid', full_name='aialgs.data.Record.uid', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='metadata', full_name='aialgs.data.Record.metadata', index=3, + number=4, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='configuration', full_name='aialgs.data.Record.configuration', index=4, + number=5, type=9, cpp_type=9, label=1, + 
has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[_RECORD_FEATURESENTRY, _RECORD_LABELENTRY, ], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=544, + serialized_end=841, +) + +_VALUE.fields_by_name['float32_tensor'].message_type = _FLOAT32TENSOR +_VALUE.fields_by_name['float64_tensor'].message_type = _FLOAT64TENSOR +_VALUE.fields_by_name['int32_tensor'].message_type = _INT32TENSOR +_VALUE.fields_by_name['bytes'].message_type = _BYTES +_VALUE.oneofs_by_name['value'].fields.append( + _VALUE.fields_by_name['float32_tensor']) +_VALUE.fields_by_name['float32_tensor'].containing_oneof = _VALUE.oneofs_by_name['value'] +_VALUE.oneofs_by_name['value'].fields.append( + _VALUE.fields_by_name['float64_tensor']) +_VALUE.fields_by_name['float64_tensor'].containing_oneof = _VALUE.oneofs_by_name['value'] +_VALUE.oneofs_by_name['value'].fields.append( + _VALUE.fields_by_name['int32_tensor']) +_VALUE.fields_by_name['int32_tensor'].containing_oneof = _VALUE.oneofs_by_name['value'] +_VALUE.oneofs_by_name['value'].fields.append( + _VALUE.fields_by_name['bytes']) +_VALUE.fields_by_name['bytes'].containing_oneof = _VALUE.oneofs_by_name['value'] +_RECORD_FEATURESENTRY.fields_by_name['value'].message_type = _VALUE +_RECORD_FEATURESENTRY.containing_type = _RECORD +_RECORD_LABELENTRY.fields_by_name['value'].message_type = _VALUE +_RECORD_LABELENTRY.containing_type = _RECORD +_RECORD.fields_by_name['features'].message_type = _RECORD_FEATURESENTRY +_RECORD.fields_by_name['label'].message_type = _RECORD_LABELENTRY +DESCRIPTOR.message_types_by_name['Float32Tensor'] = _FLOAT32TENSOR +DESCRIPTOR.message_types_by_name['Float64Tensor'] = _FLOAT64TENSOR +DESCRIPTOR.message_types_by_name['Int32Tensor'] = _INT32TENSOR +DESCRIPTOR.message_types_by_name['Bytes'] = _BYTES +DESCRIPTOR.message_types_by_name['Value'] = _VALUE +DESCRIPTOR.message_types_by_name['Record'] = _RECORD + +Float32Tensor = _reflection.GeneratedProtocolMessageType('Float32Tensor', (_message.Message,), dict( + DESCRIPTOR = _FLOAT32TENSOR, + __module__ = 'src.ai_algorithms_protobuf_python.record_pb2' + # @@protoc_insertion_point(class_scope:aialgs.data.Float32Tensor) + )) +_sym_db.RegisterMessage(Float32Tensor) + +Float64Tensor = _reflection.GeneratedProtocolMessageType('Float64Tensor', (_message.Message,), dict( + DESCRIPTOR = _FLOAT64TENSOR, + __module__ = 'src.ai_algorithms_protobuf_python.record_pb2' + # @@protoc_insertion_point(class_scope:aialgs.data.Float64Tensor) + )) +_sym_db.RegisterMessage(Float64Tensor) + +Int32Tensor = _reflection.GeneratedProtocolMessageType('Int32Tensor', (_message.Message,), dict( + DESCRIPTOR = _INT32TENSOR, + __module__ = 'src.ai_algorithms_protobuf_python.record_pb2' + # @@protoc_insertion_point(class_scope:aialgs.data.Int32Tensor) + )) +_sym_db.RegisterMessage(Int32Tensor) + +Bytes = _reflection.GeneratedProtocolMessageType('Bytes', (_message.Message,), dict( + DESCRIPTOR = _BYTES, + __module__ = 'src.ai_algorithms_protobuf_python.record_pb2' + # @@protoc_insertion_point(class_scope:aialgs.data.Bytes) + )) +_sym_db.RegisterMessage(Bytes) + +Value = _reflection.GeneratedProtocolMessageType('Value', (_message.Message,), dict( + DESCRIPTOR = _VALUE, + __module__ = 'src.ai_algorithms_protobuf_python.record_pb2' + # @@protoc_insertion_point(class_scope:aialgs.data.Value) 
+ )) +_sym_db.RegisterMessage(Value) + +Record = _reflection.GeneratedProtocolMessageType('Record', (_message.Message,), dict( + + FeaturesEntry = _reflection.GeneratedProtocolMessageType('FeaturesEntry', (_message.Message,), dict( + DESCRIPTOR = _RECORD_FEATURESENTRY, + __module__ = 'src.ai_algorithms_protobuf_python.record_pb2' + # @@protoc_insertion_point(class_scope:aialgs.data.Record.FeaturesEntry) + )) + , + + LabelEntry = _reflection.GeneratedProtocolMessageType('LabelEntry', (_message.Message,), dict( + DESCRIPTOR = _RECORD_LABELENTRY, + __module__ = 'src.ai_algorithms_protobuf_python.record_pb2' + # @@protoc_insertion_point(class_scope:aialgs.data.Record.LabelEntry) + )) + , + DESCRIPTOR = _RECORD, + __module__ = 'src.ai_algorithms_protobuf_python.record_pb2' + # @@protoc_insertion_point(class_scope:aialgs.data.Record) + )) +_sym_db.RegisterMessage(Record) +_sym_db.RegisterMessage(Record.FeaturesEntry) +_sym_db.RegisterMessage(Record.LabelEntry) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n com.amazonaws.aialgorithms.protoB\014RecordProtos')) +_FLOAT32TENSOR.fields_by_name['values'].has_options = True +_FLOAT32TENSOR.fields_by_name['values']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) +_FLOAT32TENSOR.fields_by_name['keys'].has_options = True +_FLOAT32TENSOR.fields_by_name['keys']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) +_FLOAT32TENSOR.fields_by_name['shape'].has_options = True +_FLOAT32TENSOR.fields_by_name['shape']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) +_FLOAT64TENSOR.fields_by_name['values'].has_options = True +_FLOAT64TENSOR.fields_by_name['values']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) +_FLOAT64TENSOR.fields_by_name['keys'].has_options = True +_FLOAT64TENSOR.fields_by_name['keys']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) +_FLOAT64TENSOR.fields_by_name['shape'].has_options = True +_FLOAT64TENSOR.fields_by_name['shape']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) +_INT32TENSOR.fields_by_name['values'].has_options = True +_INT32TENSOR.fields_by_name['values']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) +_INT32TENSOR.fields_by_name['keys'].has_options = True +_INT32TENSOR.fields_by_name['keys']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) +_INT32TENSOR.fields_by_name['shape'].has_options = True +_INT32TENSOR.fields_by_name['shape']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) +_RECORD_FEATURESENTRY.has_options = True +_RECORD_FEATURESENTRY._options = _descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')) +_RECORD_LABELENTRY.has_options = True +_RECORD_LABELENTRY._options = _descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')) +# @@protoc_insertion_point(module_scope)