# -*- coding: utf-8 -*-
"""London Crime
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Pov_6In2ZRcMW0CFEFUZgz8bJvILh8E9
# Import libraries
"""
# Commented out IPython magic to ensure Python compatibility.
import boto3
import pandas as pd
import numpy as np
import io
from sagemaker import get_execution_role
import os
# Amazon SageMaker Python SDK
import sagemaker.amazon.common as smac
import sagemaker
# instruction to Jupyter to inline all the visualizations
# %matplotlib inline
"""# Data Ingestion
Next, we read the dataset from S3 into memory for preprocessing prior to training.
"""
role = get_execution_role()
input_bucket = 'linkedin-sagemaker' # bucket name
in_data_key = 'crime-data.csv' # file name
training_data_location = 's3://{}/{}'.format(input_bucket, in_data_key)
# Define the headers since the data does not have any
headers = ['CommittedCrime','County','TimeofDay','DayofWeek','Month','Gender','AverageAge']
df = pd.read_csv(training_data_location, names=headers)
print('loaded training data from location: {}'.format(training_data_location))
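# Illustrative alternative (not part of the original flow): the same file could also be
# pulled with boto3 and read from an in-memory buffer, which is what the boto3 and io
# imports above support. Shown commented out so the read above stays the single source.
# s3_object = boto3.client('s3').get_object(Bucket=input_bucket, Key=in_data_key)
# df = pd.read_csv(io.BytesIO(s3_object['Body'].read()), names=headers)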
"""# Data Inspection & Visualization
Once the dataset is imported, it's typical as part of the machine learning process to inspect the data, understand the distributions, and determine what type(s) of preprocessing might be needed.
"""
#inspect the data
df
#Selecting a single column, e.g. df.County, returns just that column of data
#(here, the county recorded for each record)
df.County
#plot the distribution of average age values
df.AverageAge.hist()
#select a subset of records
df[df.TimeofDay == 'morning']
#look at the relationship among different columns
#this crosstab shows, for each county, the count of records with and without a committed crime
#For Kent: 235 records with no crime and 365 records with a crime
#For Surrey: 437 with no crime and 473 with a crime
pd.crosstab(df.County,df.CommittedCrime)
pd.crosstab(df.Gender,df.CommittedCrime)
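# Optional visualization (an illustrative addition, not part of the original notebook):
# plot the County-vs-CommittedCrime crosstab as a grouped bar chart to make the
# county-level differences easier to see. Assumes matplotlib is available, as it is on a
# standard SageMaker notebook instance.
import matplotlib.pyplot as plt
pd.crosstab(df.County, df.CommittedCrime).plot(kind='bar')
plt.title('Committed crime by county')
plt.ylabel('Record count')
plt.show()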
"""# Data Cleaning
Get rid of null or bad values
"""
#Count the null values in each column - these records need to be removed
df.isnull().sum()
#View AverageAge records that are NaN
#Where no average age was recorded, pandas reads the value in as NaN (not a number)
df[df.AverageAge.isnull()]
#REMOVE AGE RECORDS THAT ARE NaN
df.dropna(subset=['AverageAge'], inplace=True)
#double check to see removed
df['AverageAge'].isnull().sum()
#Show the records where Gender is 'Other' or null
df[ (df.Gender == 'Other') | (df.Gender.isnull())]
#Remove the null records for Gender
df.dropna(subset=['Gender'], inplace=True)
#double check to see removed
df['Gender'].isnull().sum()
#show the records with Gender as Other
#NOTE: seems to be a common pattern with police in Hampshire when collecting data; they don't add a gender
df[(df.Gender == 'Other')]
#Remove the records that are 'Other' for Gender
#NOTE: Make sure to assign back the updated data frame
df = df[df.Gender != 'Other']
#double check to see if removed
df[(df.Gender == 'Other')]
#Let's look at dataframe again that contains the cleaned data
df
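# Optional sanity check (illustrative addition): confirm the cleaning steps above removed
# all null values and see how many records remain for training.
print('Remaining records: {}'.format(len(df)))
print('Remaining null values: {}'.format(df.isnull().sum().sum()))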
"""# Data Encoding & Transformation
Before you start training, you first need to prepare the data because classifiers only work with numerical values. Let's convert the categorical features into numeric features.
"""
#check to see the data types we have
df.dtypes
#Use the "find and replace" method to convert text values to numbers
#Day of Week translates naturally to numbers - Monday is 1, Tuesday is 2, Wednesday is 3, ...
#Month translates naturally to numbers - January is 1, February is 2, March is 3, ...
#Gender, TimeofDay, and County have no natural ordering, so they are one-hot encoded later
convert_nums = {"DayofWeek": {"Monday": 1,
"Tuesday": 2,
"Wednesday": 3,
"Thursday": 4,
"Friday": 5,
"Saturday": 6,
"Sunday": 7},
"Month": {"January": 1,
"February": 2,
"March": 3,
"April": 4,
"May": 5,
"June": 6,
"July": 7,
"August": 8,
"September": 9,
"October": 10,
"November": 11,
"December": 12}}
#work on an explicit copy to avoid the pandas SettingWithCopyWarning
#(the older df.is_copy flag has been removed from recent pandas versions)
df = df.copy()
#convert the columns to numbers using replace
df.replace(convert_nums, inplace=True)
df.head()
#Use "one-hot encoding" for values that can't easily be translated to a single numeric value
#Each category value becomes its own column that gets a 1 or 0 (True/False) for each record.
#This avoids weighting a value improperly, but it does add more columns to the data set.
#Pandas supports this with get_dummies, named for the dummy/indicator variables (i.e. 1 or 0) it creates
#List all the unique values for Gender
df.Gender.unique()
#Convert using get_dummies; goes from 1 column to 2 with a 1 or 0 corresponding to the correct value
df = pd.get_dummies(df, columns=["Gender"])
df
#List all the unique values for TimeofDay
df.TimeofDay.unique()
#Convert using get_dummies; goes from 1 column to 3 with a 1 or 0 corresponding to the correct value
df = pd.get_dummies(df, columns=["TimeofDay"])
df
#List all the unique values for County
df.County.unique()
#Convert using get_dummies; goes from 1 column to 7 with a 1 or 0 corresponding to the correct value
df = pd.get_dummies(df, columns=["County"])
df
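# Optional check (illustrative addition): after replace() and get_dummies(), the encoded
# feature columns should all be numeric or boolean indicators. Only the label column,
# CommittedCrime, may still be text depending on how it is stored in the source CSV.
df.dtypes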
"""# Splitting into Training, Test and Validation Sets
This will help prevent us from overfitting the model and allow us to test the model's accuracy on data it hasn't already seen.
"""
# convert dataframe to csv; save the original cleaned file to notebook instance
df.to_csv('crime-data-cleaned.csv', header=False, index=False)
# Split the data for training, validation, and test into separate dataframes
# produces a 60%, 20%, 20% split for training, validation and test sets
train_data, validation_data, test_data = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
#convert dfs to .csv and save locally; file is saved to notebook instance
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)
test_data.to_csv('test.csv', header=False, index=False)
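# Optional sanity check (illustrative addition): confirm the split is roughly 60% / 20% / 20%.
print('train: {}, validation: {}, test: {}'.format(len(train_data), len(validation_data), len(test_data)))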
"""# Upload Training Data to S3
Now that we've created our CSV files, we'll need to upload them to S3 so that Amazon SageMaker training can use them.
"""
# load files to S3
# The S3 bucket, prefix, and file name that you want to use for training and model data
output_bucket = 'linkedin-sagemaker'
prefix = 'cleaned'
boto3.resource('s3').Bucket(output_bucket).Object(os.path.join(prefix, 'crime-data-cleaned.csv')).upload_file('crime-data-cleaned.csv')
boto3.resource('s3').Bucket(output_bucket).Object(os.path.join(prefix, 'train.csv')).upload_file('train.csv')
boto3.resource('s3').Bucket(output_bucket).Object(os.path.join(prefix, 'validation.csv')).upload_file('validation.csv')
boto3.resource('s3').Bucket(output_bucket).Object(os.path.join(prefix, 'test.csv')).upload_file('test.csv')
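# Illustrative sketch (not part of the original notebook): once the CSVs are in S3, they
# could be wired up as training and validation channels for a SageMaker estimator. This
# assumes SageMaker Python SDK v2, where TrainingInput lives in sagemaker.inputs; the
# estimator itself is not defined here.
from sagemaker.inputs import TrainingInput

s3_train = TrainingInput(
    s3_data='s3://{}/{}/train.csv'.format(output_bucket, prefix),
    content_type='text/csv')
s3_validation = TrainingInput(
    s3_data='s3://{}/{}/validation.csv'.format(output_bucket, prefix),
    content_type='text/csv')
# A trained estimator would then consume these channels like:
# estimator.fit({'train': s3_train, 'validation': s3_validation})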