# Skeleton_Code_for_Logistics_and_Analytics.py
## Libraries:
import os  # File paths
import numpy as np  # Linear algebra
import pandas as pd  # Data processing
import warnings  # Warning filter
# Plotting libraries:
import matplotlib.pyplot as plt
import seaborn as sns
# Relevant ML Libraries:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
# ML Models:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
## File Paths:
# List all files under the input directory:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Path for the training set:
tr_path = ""  # TBD later
# Path for the testing set:
te_path = ""  # TBD later
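# Illustration only: on Kaggle the paths usually follow the (hypothetical)
# pattern below; substitute the real dataset and file names once known.
# tr_path = '/kaggle/input/<dataset-name>/train.csv'
# te_path = '/kaggle/input/<dataset-name>/test.csv'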
## Preprocessing and Data Analysis:
# Training Set:
# Read a csv file as a DataFrame:
tr_df = pd.read_csv(tr_path)
# Explore the first 5 rows:
tr_df.head()
# Testing Set:
# Read a csv file as a DataFrame:
te_df = pd.read_csv(te_path)
# Explore the first 5 rows:
te_df.head()
# Size of each data set:
print(f"training set (row, col): {tr_df.shape}\n\ntesting set (row, col): {te_df.shape}")
# Column Information:
tr_df.info(verbose=True, show_counts=True)
## Data visualization:
'''We need to split our data into categorical and numerical columns,
using the `.select_dtypes('dtype').columns.to_list()` combination.'''
## Region Demand Distribution:
# List of all the numeric columns:
num = tr_df.select_dtypes('number').columns.to_list()
# List of all the categorical columns:
cat = tr_df.select_dtypes('object').columns.to_list()
# Numeric df:
region_num = tr_df[num]
# Categorical df:
region_cat = tr_df[cat]
print(tr_df[cat[-1]].value_counts())
total = float(len(tr_df[cat[-1]]))
plt.figure(figsize=(8,10))
sns.set(style="whitegrid")
ax = sns.countplot(x=tr_df[cat[-1]])
# Annotate each bar with its share of the total:
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2., height + 3,
            '{:1.2f}'.format(height / total), ha="center")
plt.show()
## Let's plot our data:
# Numeric:
for i in region_num:
    plt.hist(region_num[i])
    plt.title(i)
    plt.show()
# Categorical (split by Region Demand):
for i in cat[:-1]:
    plt.figure(figsize=(15, 10))
    plt.subplot(2, 3, 1)
    sns.countplot(x=i, hue='Region Demand', data=tr_df, palette='plasma')
    plt.xlabel(i, fontsize=14)
    plt.show()
## Encoding data to numeric:
# Map the categorical labels in both datasets to numeric values via the `to_numeric` dictionary:
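# NOTE: `to_numeric` is never defined in this skeleton. A minimal placeholder
# is sketched below; the keys are hypothetical category labels and must be
# replaced with the actual levels found in this dataset.
to_numeric = {'Low': 0, 'Medium': 1, 'High': 2}  # hypothetical mapping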
# (.infer_objects() casts the now-numeric object columns to numeric dtypes
# so that .corr() below works.)
tr_df = tr_df.applymap(lambda label: to_numeric.get(label, label)).infer_objects()
te_df = te_df.applymap(lambda label: to_numeric.get(label, label)).infer_objects()
# Sanity-check the encoded datasets:
print(f"training set (row, col): {tr_df.shape}\n\ntesting set (row, col): {te_df.shape}\n")
tr_df.info()
te_df.info()
# Plotting the Correlation Matrix:
sns.heatmap(tr_df.corr(), cmap='cubehelix_r')
plt.show()
# Correlation Table:
corr = tr_df.corr()
corr.style.background_gradient(cmap='coolwarm').format(precision=2)
'''
## First we divide the dataset into two variables: `X`, the features we
defined earlier, and `y`, the `Region Demand` target value we want to predict.
## Machine Learning Models we will use:
* **Decision Tree**
* **Random Forest**
* **XGBoost**
* **Logistic Regression**
## The Process of Modeling the Data:
1. Importing the model
2. Fitting the model
3. Predicting Region Demand
4. Classification report by Region Demand
5. Overall accuracy
A reusable sketch of steps 2-5 appears right after this docstring.
'''
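# A minimal reusable sketch of steps 2-5 above (illustration only; the
# explicit per-model blocks below do not depend on it):
def fit_and_report(model, X_train, X_test, y_train, y_test):
    """Fit a classifier, print its classification report, and return accuracy."""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    score = accuracy_score(y_test, y_pred)
    print(f"{round(score * 100, 2)}% Accurate")
    return score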
y = tr_df['Region Demand']
X = tr_df.drop('Region Demand', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
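# If `Region Demand` turns out to be imbalanced, a stratified split may be
# preferable (a sketch, not part of the original skeleton):
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.3, random_state=0, stratify=y)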
## Decision Tree:
DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)
y_predict = DT.predict(X_test)
# Prediction summary by class:
print(classification_report(y_test, y_predict))
# Accuracy score:
DT_SC = accuracy_score(y_test, y_predict)
print(f"{round(DT_SC*100,2)}% Accurate")
# Csv results of the test for our model:
# (Each prediction and its true value appear side by side in the csv created in the output directory.)
Decision_Tree = pd.DataFrame({'y_test': y_test, 'prediction': y_predict})
Decision_Tree.to_csv("Decision Tree.csv")
## Random Forest:
RF = RandomForestClassifier()
RF.fit(X_train, y_train)
y_predict = RF.predict(X_test)
# Prediction summary by class:
print(classification_report(y_test, y_predict))
# Accuracy score:
RF_SC = accuracy_score(y_test, y_predict)
print(f"{round(RF_SC*100,2)}% Accurate")
# Csv results of the test for our model:
# (Each prediction and its true value appear side by side in the csv created in the output directory.)
Random_Forest = pd.DataFrame({'y_test': y_test, 'prediction': y_predict})
Random_Forest.to_csv("Random Forest.csv")
## XGBoost:
XGB = XGBClassifier()
XGB.fit(X_train, y_train)
y_predict = XGB.predict(X_test)
# Prediction summary by class:
print(classification_report(y_test, y_predict))
# Accuracy score:
XGB_SC = accuracy_score(y_test, y_predict)
print(f"{round(XGB_SC*100,2)}% Accurate")
# Csv results of the test for our model:
# (Each prediction and its true value appear side by side in the csv created in the output directory.)
XGBoost = pd.DataFrame({'y_test': y_test, 'prediction': y_predict})
XGBoost.to_csv("XGBoost.csv")
## Logistic Regression:
LR = LogisticRegression(max_iter=1000)  # raised max_iter to help convergence
LR.fit(X_train, y_train)
y_predict = LR.predict(X_test)
# Prediction summary by class:
print(classification_report(y_test, y_predict))
# Accuracy score:
LR_SC = accuracy_score(y_test, y_predict)
print(f"{round(LR_SC*100,2)}% Accurate")
# Csv results of the test for our model:
# (Each prediction and its true value appear side by side in the csv created in the output directory.)
Logistic_Regression = pd.DataFrame({'y_test': y_test, 'prediction': y_predict})
Logistic_Regression.to_csv("Logistic Regression.csv")
'''
Conclusion:
`Demand History` is an important variable: its high correlation with
`Region Demand` indicates that the latter depends strongly on it.
'''
score = [DT_SC, RF_SC, XGB_SC, LR_SC]
Models = pd.DataFrame({
    'Model': ["Decision Tree", "Random Forest", "XGBoost", "Logistic Regression"],
    'Score': score})
print(Models.sort_values(by='Score', ascending=False))
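# A short comparison plot (a sketch, not part of the original skeleton):
sns.barplot(x='Score', y='Model', data=Models.sort_values(by='Score', ascending=False))
plt.title('Model Accuracy Comparison')
plt.show()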