# -*- coding: utf-8 -*-
"""London Crime
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Pov_6In2ZRcMW0CFEFUZgz8bJvILh8E9
# Import libraries
"""
# Commented out IPython magic to ensure Python compatibility.
import boto3
import pandas as pd
import numpy as np
import io
from sagemaker import get_execution_role
import os
# Amazon SageMaker Python SDK
import sagemaker.amazon.common as smac
import sagemaker
# instruction to Jupyter to inline all the visualizations
# %matplotlib inline
"""# Data Ingestion
Next, we read the dataset from S3 into memory for preprocessing prior to training.
"""
role = get_execution_role()
input_bucket = 'linkedin-sagemaker' # bucket name
in_data_key = 'crime-data.csv' # file name
training_data_location = 's3://{}/{}'.format(input_bucket, in_data_key)
# Define the headers since the data does not have any
headers = ['CommittedCrime','County','TimeofDay','DayofWeek','Month','Gender','AverageAge']
df = pd.read_csv(training_data_location, names=headers)
print('loaded training data from location: {}'.format(training_data_location))
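# Illustrative alternative (not part of the original flow): the same file could also be
# pulled with boto3 and read from an in-memory buffer, which is what the boto3 and io
# imports above support. Shown commented out so the read above stays the single source.
# s3_object = boto3.client('s3').get_object(Bucket=input_bucket, Key=in_data_key)
# df = pd.read_csv(io.BytesIO(s3_object['Body'].read()), names=headers)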
"""# Data Inspection & Visualization
Once the dataset is imported, it's typical as part of the machine learning process to inspect the data, understand the distributions, and determine what type(s) of preprocessing might be needed.
"""
#inspect the data
df
#Selecting a single column, e.g. df.County, returns just that column of data
#(here, the county recorded for each record)
df.County
#plot the distribution of average age values
df.AverageAge.hist()
#select a subset of records
df[df.TimeofDay == 'morning']
#look at the relationship among different columns
#this crosstab shows, for each county, the count of records with and without a committed crime
#For Kent: 235 records with no crime and 365 records with a crime
#For Surrey: 437 with no crime and 473 with a crime
pd.crosstab(df.County,df.CommittedCrime)
pd.crosstab(df.Gender,df.CommittedCrime)
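# Optional visualization (an illustrative addition, not part of the original notebook):
# plot the County-vs-CommittedCrime crosstab as a grouped bar chart to make the
# county-level differences easier to see. Assumes matplotlib is available, as it is on a
# standard SageMaker notebook instance.
import matplotlib.pyplot as plt
pd.crosstab(df.County, df.CommittedCrime).plot(kind='bar')
plt.title('Committed crime by county')
plt.ylabel('Record count')
plt.show()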
"""# Data Cleaning
Get rid of null or bad values
"""
#Count the null values in each column - these records need to be removed
df.isnull().sum()
#View AverageAge records that are NaN
#Where no average age was recorded, pandas reads the value in as NaN (not a number)
df[df.AverageAge.isnull()]
#REMOVE AGE RECORDS THAT ARE NaN
df.dropna(subset=['AverageAge'], inplace=True)
#double check to see removed
df['AverageAge'].isnull().sum()
#Show the records where Gender is 'Other' or null
df[ (df.Gender == 'Other') | (df.Gender.isnull())]
#Remove the null records for Gender
df.dropna(subset=['Gender'], inplace=True)
#double check to see removed
df['Gender'].isnull().sum()
#show the records with Gender as Other
#NOTE: seems to be a common pattern with police in Hampshire when collecting data; they don't add a gender
df[(df.Gender == 'Other')]
#Remove the records that are 'Other' for Gender
#NOTE: Make sure to assign back the updated data frame
df = df[df.Gender != 'Other']
#double check to see if removed
df[(df.Gender == 'Other')]
#Let's look at dataframe again that contains the cleaned data
df
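# Optional sanity check (illustrative addition): confirm the cleaning steps above removed
# all null values and see how many records remain for training.
print('Remaining records: {}'.format(len(df)))
print('Remaining null values: {}'.format(df.isnull().sum().sum()))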
"""# Data Encoding & Transformation
Before you start training, you first need to prepare the data because classifiers only work with numerical values. Let's convert the categorical features into numeric features.
"""
#check to see the data types we have
df.dtypes
#Use the "find and replace" method to convert text values to numbers
#Day of Week translates naturally to numbers - Monday is 1, Tuesday is 2, Wednesday is 3, ...
#Month translates naturally to numbers - January is 1, February is 2, March is 3, ...
#Gender, TimeofDay, and County have no natural ordering, so they are one-hot encoded later
convert_nums = {"DayofWeek": {"Monday": 1,
"Tuesday": 2,
"Wednesday": 3,
"Thursday": 4,
"Friday": 5,
"Saturday": 6,
"Sunday": 7},
"Month": {"January": 1,
"February": 2,
"March": 3,
"April": 4,
"May": 5,
"June": 6,
"July": 7,
"August": 8,
"September": 9,
"October": 10,
"November": 11,
"December": 12}}
#work on an explicit copy to avoid the pandas SettingWithCopyWarning
#(the older df.is_copy flag has been removed from recent pandas versions)
df = df.copy()
#convert the columns to numbers using replace
df.replace(convert_nums, inplace=True)
df.head()
#Use "one-hot encoding" for values that can't easily be translated to a single numeric value
#Each category value becomes its own column that gets a 1 or 0 (True/False) for each record.
#This avoids weighting a value improperly, but it does add more columns to the data set.
#Pandas supports this with get_dummies, named for the dummy/indicator variables (i.e. 1 or 0) it creates
#List all the unique values for Gender
df.Gender.unique()
#Convert using get_dummies; goes from 1 column to 2 with a 1 or 0 corresponding to the correct value
df = pd.get_dummies(df, columns=["Gender"])
df
#List all the unique values for TimeofDay
df.TimeofDay.unique()
#Convert using get_dummies; goes from 1 column to 3 with a 1 or 0 corresponding to the correct value
df = pd.get_dummies(df, columns=["TimeofDay"])
df
#List all the unique values for County
df.County.unique()
#Convert using get_dummies; goes from 1 column to 7 with a 1 or 0 corresponding to the correct value
df = pd.get_dummies(df, columns=["County"])
df
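# Optional check (illustrative addition): after replace() and get_dummies(), the encoded
# feature columns should all be numeric or boolean indicators. Only the label column,
# CommittedCrime, may still be text depending on how it is stored in the source CSV.
df.dtypes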
"""# Splitting into Training, Test and Validation Sets
This will help prevent us from overfitting the model and allow us to test the model's accuracy on data it hasn't already seen.
"""
# convert dataframe to csv; save the original cleaned file to notebook instance
df.to_csv('crime-data-cleaned.csv', header=False, index=False)
# Split the data for training, validation, and test into separate dataframes
# produces a 60%, 20%, 20% split for training, validation and test sets
train_data, validation_data, test_data = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
#convert dfs to .csv and save locally; file is saved to notebook instance
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)
test_data.to_csv('test.csv', header=False, index=False)
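# Optional sanity check (illustrative addition): confirm the split is roughly 60% / 20% / 20%.
print('train: {}, validation: {}, test: {}'.format(len(train_data), len(validation_data), len(test_data)))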
"""# Upload Training Data to S3
Now that we've created our CSV files, we'll need to upload them to S3 so that Amazon SageMaker training can use them.
"""
# load files to S3
# The S3 bucket, prefix, and file name that you want to use for training and model data
output_bucket = 'linkedin-sagemaker'
prefix = 'cleaned'
boto3.resource('s3').Bucket(output_bucket).Object(os.path.join(prefix, 'crime-data-cleaned.csv')).upload_file('crime-data-cleaned.csv')
boto3.resource('s3').Bucket(output_bucket).Object(os.path.join(prefix, 'train.csv')).upload_file('train.csv')
boto3.resource('s3').Bucket(output_bucket).Object(os.path.join(prefix, 'validation.csv')).upload_file('validation.csv')
boto3.resource('s3').Bucket(output_bucket).Object(os.path.join(prefix, 'test.csv')).upload_file('test.csv')
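# Illustrative sketch (not part of the original notebook): once the CSVs are in S3, they
# could be wired up as training and validation channels for a SageMaker estimator. This
# assumes SageMaker Python SDK v2, where TrainingInput lives in sagemaker.inputs; the
# estimator itself is not defined here.
from sagemaker.inputs import TrainingInput

s3_train = TrainingInput(
    s3_data='s3://{}/{}/train.csv'.format(output_bucket, prefix),
    content_type='text/csv')
s3_validation = TrainingInput(
    s3_data='s3://{}/{}/validation.csv'.format(output_bucket, prefix),
    content_type='text/csv')
# A trained estimator would then consume these channels like:
# estimator.fit({'train': s3_train, 'validation': s3_validation})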