-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdummyPy.py
158 lines (141 loc) · 5.77 KB
/
dummyPy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from collections import defaultdict
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
class Encoder():
    """
    Helper class to one-hot encode the levels of a single categorical variable.

    Levels seen during ``fit`` are mapped to column indices in sorted order;
    values not seen during ``fit`` are encoded as all-zero rows.
    """
    def __init__(self):
        # Maps each known level to its column index; populated by fit().
        self.column_mapper = None
    def fit(self, levels):
        """
        Learn the level -> column-index mapping.

        Parameters
        ----------
        levels: set
            Unique levels of the categorical variable.
        """
        # Sort the levels so column order is deterministic across runs.
        self.column_mapper = {x: i for i, x in enumerate(sorted(levels))}
    def transform(self, column_data):
        """
        One-hot encode a column of data.

        Parameters
        ----------
        column_data: pandas Series object
            The column to encode.

        Returns
        -------
        scipy.sparse.coo_matrix
            Sparse matrix of shape (len(column_data), number of levels).
        """
        rows = []
        cols = []
        for i, x in enumerate(column_data):
            # Values never seen during fit are skipped -> all-zero row.
            if x in self.column_mapper:
                rows.append(i)
                cols.append(self.column_mapper[x])
        data = np.ones(len(rows))
        # Explicit row/col lists (instead of zip(*pairs)) keep this valid
        # even when no value in column_data is a known level; zip(*[]) would
        # raise ValueError inside coo_matrix's (row, col) unpacking.
        return(coo_matrix((data, (rows, cols)),
                          shape=(column_data.shape[0], len(self.column_mapper))))
class OneHotEncoder():
    """
    One hot encode the categorical variables of a data frame.

    Categorical columns are replaced by their one hot encoded counterparts
    while every other column is passed through unchanged. The level sets can
    also be learned from data read in chunks (a pandas TextFileReader), which
    makes the encoder usable on datasets too large to fit in memory.

    Example
    -------
    Using kaggle's titanic data (`https://www.kaggle.com/c/titanic/data`):

    >>> from dummyPy import OneHotEncoder
    >>> import pandas as pd
    >>> encoder = OneHotEncoder(categorical_columns=["Pclass", "Sex", "Embarked"])
    >>> data = pd.read_csv("titanic.csv", usecols=["Pclass", "Sex", "Age", "Fare", "Embarked"])
    >>> encoder.fit(data)
    >>> X = encoder.transform(data)
    >>> X.shape
    (891, 11)

    The same works when fitting from chunks:

    >>> chunked_data = pd.read_csv("titanic.csv",
                                   usecols=["Pclass", "Sex", "Age", "Fare", "Embarked"],
                                   chunksize=10)
    >>> encoder2 = OneHotEncoder(categorical_columns=["Pclass", "Sex", "Embarked"])
    >>> encoder2.fit(chunked_data)
    >>> encoder2.transform(data).shape
    (891, 11)
    """
    def __init__(self, categorical_columns):
        """
        Parameters
        ----------
        categorical_columns: list
            Names of the categorical variables in the data. All of these
            columns must have dtype string.
        """
        self.categorical_columns = categorical_columns
        # column name -> set of levels observed so far across fit calls/chunks.
        self.unique_vals = defaultdict(set)
        # One Encoder per categorical column; fitted by _fit_encoders().
        self.encoders = {name: Encoder() for name in categorical_columns}
    def _update_unique_vals(self, data):
        # Accumulate the levels present in this (chunk of) data.
        for name in self.categorical_columns:
            self.unique_vals[name].update(data[name])
    def _fit_encoders(self):
        # Fit each per-column encoder on all levels collected so far.
        for name in self.categorical_columns:
            self.encoders[name].fit(self.unique_vals[name])
    def fit(self, data):
        """
        Read the categorical columns and learn the one hot encoded column
        shapes. The data may also be supplied in chunks.

        Parameters
        ----------
        data: pandas.core.frame.DataFrame or pandas.io.parsers.TextFileReader
            Either a pandas data frame or a TextFileReader, the latter being
            what pandas read_csv returns when a chunksize is given. Pass a
            TextFileReader when the dataset is too large for memory.
        """
        if isinstance(data, pd.core.frame.DataFrame):
            self._update_unique_vals(data)
        else:
            # Anything else is treated as an iterable of data frames.
            for chunk in data:
                self._update_unique_vals(chunk)
        self._fit_encoders()
    def transform(self, data):
        """
        Convert the categorical columns of the data frame into one hot
        encoded columns, leaving continuous columns as they are.

        Parameters
        ----------
        data: pandas data frame
            The data frame object that needs to be transformed.
        """
        pieces = []
        for name in data.columns:
            if name in self.categorical_columns:
                pieces.append(self.encoders[name].transform(data[name]).toarray())
            else:
                # Continuous column: keep values, reshape to a single column.
                pieces.append(data[name].values.reshape(-1, 1))
        return np.array(np.concatenate(pieces, axis=1), dtype=object)
    def fit_transform(self, data):
        """
        Call fit and then transform on the same data.

        Note that unlike fit, this method accepts only a pandas data frame,
        not a TextFileReader.

        Parameters
        ----------
        data: pandas.core.frame.DataFrame
            A pandas data frame.
        """
        self.fit(data)
        return self.transform(data)