-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
59 lines (50 loc) · 3 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""Preprocessing Data"""
import sys
import pandas as pd
def _strip_quotes(value):
    """Return *value* without leading/trailing single quotes; pass non-strings through."""
    return value.strip("'") if isinstance(value, str) else value


def _to_lower(value):
    """Return *value* lower-cased; pass non-strings (e.g. missing cells) through."""
    return value.lower() if isinstance(value, str) else value


def _count_changed(before, after):
    """Count the non-missing cells of Series *before* whose value differs in *after*."""
    # Missing cells never compare equal to themselves, so mask them out explicitly.
    return int((after.ne(before) & before.notna()).sum())


def preprocess(input_file, output_file):
    """Clean the CSV at *input_file* and write the result to *output_file*.

    Steps, each of which prints a short report to stdout:

    1(i)   Strip surrounding single quotes from ``race``, ``race_o`` and
           ``field``.
    1(ii)  Lower-case ``field``.
    1(iii) Replace ``gender``, ``race``, ``race_o`` and ``field`` with their
           integer category codes.
    1(iv)  Divide each participant / partner preference score by the sum of
           its group within the same row, then print the column means.

    Missing or non-string cells in the text columns are left untouched by
    steps 1(i) and 1(ii) instead of raising ``AttributeError``.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write (no index column).

    Raises:
        KeyError: If a required column is absent, or if one of the reported
            labels ('male', 'European/Caucasian-American',
            'Latino/Hispanic American', 'law') does not occur in its column.
    """
    df = pd.read_csv(str(input_file))

    # 1(i) Removing Quotes
    # Only the affected columns are compared, so no full-frame copy is needed.
    quotes_removed = 0
    for column in ('race', 'race_o', 'field'):
        original = df[column]
        df[column] = original.map(_strip_quotes)
        quotes_removed += _count_changed(original, df[column])
    print(f'Quotes removed from {quotes_removed} cells.')

    # 1(ii) Lowercasing
    original_field = df['field']
    df['field'] = original_field.map(_to_lower)
    lowercased = _count_changed(original_field, df['field'])
    print(f'Standardized {lowercased} cells to lower case.')

    # 1(iii) Categorical attributes
    categories = {}
    for attribute in ('gender', 'race', 'race_o', 'field'):
        as_category = df[attribute].astype('category')
        categories[attribute] = as_category.cat.categories
        df[attribute] = as_category.cat.codes

    reported_labels = (
        ('gender', 'male'),
        ('race', 'European/Caucasian-American'),
        ('race_o', 'Latino/Hispanic American'),
        ('field', 'law'),
    )
    # Resolve every code before printing so a missing label fails up front,
    # before any of the four report lines is emitted.
    assigned_codes = [
        (column, label, categories[column].get_loc(label))
        for column, label in reported_labels
    ]
    for column, label, code in assigned_codes:
        print(f'Value assigned for {label} in column {column}: {code}.')

    # 1(iv) Normalization
    preference_scores_of_participant = [
        'attractive_important', 'sincere_important', 'intelligence_important',
        'funny_important', 'ambition_important', 'shared_interests_important',
    ]
    preference_scores_of_partner = [
        'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
        'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests',
    ]
    for group in (preference_scores_of_participant, preference_scores_of_partner):
        row_totals = df[group].sum(axis=1)
        df[group] = df[group].div(row_totals, axis=0)

    for attribute in preference_scores_of_participant + preference_scores_of_partner:
        mean_of_attribute = round(df[attribute].mean(), 2)
        print(f'Mean of {attribute}: {mean_of_attribute:.2f}')

    # Output file
    df.to_csv(str(output_file), index=False)
if __name__ == '__main__':
    # Two positional arguments are required: the input CSV path and the
    # output CSV path. Exit with a usage message instead of an IndexError
    # traceback when they are missing; extra arguments are ignored as before.
    if len(sys.argv) < 3:
        sys.exit(f'Usage: python {sys.argv[0]} <input_csv> <output_csv>')
    preprocess(sys.argv[1], sys.argv[2])