-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
184 lines (153 loc) · 7.23 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env python
# coding: utf-8
# ## Spotify User Data- A Dashboard
#importing libraries
import pandas as pd
import numpy as np
from math import pi
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure,gmap
from bokeh.models import ColumnDataSource,LabelSet,GMapOptions,BasicTicker, ColorBar, LinearColorMapper, PrintfTickFormatter
from bokeh.palettes import GnBu3,Category20c
from bokeh.plotting import figure, show
from bokeh.transform import cumsum,transform,jitter
from bokeh.layouts import gridplot, grid
import io
from bokeh.embed import components
# from bokeh.models import HoverTool
from bokeh.models import LinearAxis, Range1d
from bokeh.resources import CDN
from bokeh.models.widgets import DataTable, DateFormatter, TableColumn, Div
from jinja2 import Template
import datetime as dt
#output_notebook()
# The finished dashboard is written to a standalone HTML file.
output_file('dash.html')

# Load the tab-separated user records, discarding every row that has a
# missing value in any column.
df = pd.read_table('users.tsv').dropna()

# The registration time is stored as Unix epoch seconds; expose it as a
# proper datetime column named 'date'.
df = df.assign(date=pd.to_datetime(df['registered_unixtime'], unit='s'))

# Rows dated 2012-06-30 or later are too sparse to chart, so only earlier
# registrations are kept.
df = df[df['date'] < '2012-06-30']
# --- Plot 1: users per age bracket, stacked by gender -----------------------
# Ages are grouped into left-closed intervals: one bin from the smallest age
# up to 0, decade-wide bins up to 60, and a final bin reaching just past the
# largest age in the data.
# NOTE(review): the edges are only monotonic while the minimum age is below 0
# (the axis labels previously hard-coded here started at -1) -- confirm this
# still holds if the input data changes.
bins = [df['age'].min(), 0, 10, 20, 30, 40, 50, 60, df['age'].max() + 1]
age_intervals = pd.cut(df['age'], bins, right=False)

# The axis categories are taken from the intervals that were actually built,
# so they always match the strings stored in 'age_bin' instead of relying on
# literals that embed one particular dataset's minimum and maximum age.
age_labels = [str(interval) for interval in age_intervals.cat.categories]
df['age_bin'] = age_intervals.astype(str)

# Registration year as a string (categorical axes need string factors).
df['year'] = df['date'].dt.year.astype(str)

# One row per age bracket and one column per gender code, holding user counts.
# fill_value=0 gives every stacked segment a numeric length even when an
# age/gender combination has no users.
dfp = df.pivot_table(values='age', index='age_bin', columns='gender',
                     aggfunc='count', fill_value=0).reset_index()

p1 = figure(title='Males between 20-30 are top Users', y_range=age_labels,
            plot_width=600, plot_height=350, toolbar_location=None)
p1.hbar_stack(['f', 'm', 'n'], y='age_bin', height=0.8, color=GnBu3,
              source=ColumnDataSource(dfp.to_dict('list')),
              legend_label=['female', 'male', 'unspecified'])
p1.y_range.range_padding = 0.0
p1.ygrid.grid_line_color = None
p1.legend.location = "top_right"
p1.axis.minor_tick_line_color = None
p1.outline_line_color = None
p1.xaxis.axis_label = 'Count of Users'
p1.yaxis.axis_label = 'Age'
# --- Plot 2: donut chart of the gender split ---------------------------------
# Number of users per gender code (only the user_id column is counted).
genders = df.groupby('gender')['user_id'].count().reset_index()

# Translate gender codes into display labels by value, so the legend does not
# depend on the sort order or on the number of groups returned by groupby.
# Codes without a known label are shown unchanged.
gender_labels = {'f': 'Females', 'm': 'Males', 'n': 'Unspecified'}
genders['gender'] = genders['gender'].map(
    lambda code: gender_labels.get(code, code))

# Each wedge spans the group's share of all users, expressed in radians.
genders['angle'] = genders['user_id'] / genders['user_id'].sum() * 2 * pi
# Category20c is indexed by the number of colours wanted, one per group.
genders['color'] = Category20c[len(genders['user_id'])]

p2 = figure(title='Nearly 66% of the Users are Males', plot_height=350,
            plot_width=600, toolbar_location=None)
# cumsum() places the wedges end to end: each one starts where the previous
# one stopped.
p2.annular_wedge(x=0, y=0, inner_radius=0.2, outer_radius=0.4,
                 start_angle=cumsum('angle', include_zero=True),
                 end_angle=cumsum('angle'),
                 line_color="white", fill_color='color',
                 legend_field='gender', source=genders)
# A donut needs neither axes nor grid lines.
p2.axis.visible = False
p2.grid.grid_line_color = None
# --- Plot 3: distribution of play counts -------------------------------------
# Ten bins of width 10,000 covering 0-100,000 plays, followed by a single
# catch-all bin that ends at 2,000,000.
playcount_edges = list(range(0, 100001, 10000)) + [2000000]
counts, bin_edges = np.histogram(df['playcount'].values, bins=playcount_edges)

p3 = figure(title='Most (~24000) Users listened to about 10000 tracks',
            plot_width=600, plot_height=350, tools='',
            background_fill_color="#fafafa")
# One rectangle per bin, rising from the x-axis to the bin's user count.
p3.quad(left=bin_edges[:-1], right=bin_edges[1:], bottom=0, top=counts,
        fill_color="navy", line_color="white", alpha=0.5)

# The visible x-range stops at 100,000 plays.
p3.x_range.start = 0
p3.x_range.end = 100000
p3.y_range.start = 0
p3.xaxis.axis_label = 'Play Count'
p3.yaxis.axis_label = 'Users'
p3.grid.grid_line_color = "white"
# --- Plot 4: registrations per calendar quarter ------------------------------
# Group users by the quarter of their registration date and count them; the
# result has a 'date' column (the quarter) and a 'user_id' column (the count).
signup_quarter = df['date'].dt.to_period('Q')
quarterly_signups = df.groupby(signup_quarter)['user_id'].count().reset_index()

p4 = figure(title="Sign-Up increased exponentially towards end of study period",
            x_axis_type="datetime", plot_height=350, plot_width=600)
p4.xaxis.axis_label = 'Date of Registration'
p4.yaxis.axis_label = 'No of Registrations'
# Filled area between the quarterly count and the zero baseline.
p4.varea(x=quarterly_signups['date'], y1=quarterly_signups['user_id'], y2=0,
         fill_color="purple")
# --- Plot 5: time of day of each sign-up, per weekday ------------------------
# Work on an explicit copy: the columns added below must not be written into
# a slice of df (pandas warns about that with SettingWithCopyWarning).
signups = df[['date']].copy()
# Vectorised weekday names. day_name() returns English names regardless of
# the process locale, which is what the English DAYS axis below requires.
signups['day'] = signups['date'].dt.day_name()
signups['time'] = signups['date'].dt.time
signups.set_index('date', inplace=True)

# Categorical y-axis factors, listed from the bottom of the axis upwards.
DAYS = ['Sunday', 'Saturday', 'Friday', 'Thursday', 'Wednesday', 'Tuesday', 'Monday']

# Only the last 5000 rows are plotted to keep the chart readable.
source = ColumnDataSource(signups.tail(5000))

p5 = figure(y_range=DAYS, x_axis_type='datetime', plot_height=350, plot_width=600,
            title="Most Users Sign Up between 3 PM and 8 PM, Weekends don't make a difference")
# jitter() offsets the points randomly around each weekday so they do not all
# fall on a single line.
p5.circle(x='time', y=jitter('day', width=0.7, range=p5.y_range), source=source,
          alpha=.75, size=3, fill_color='cyan', line_color='blue')
# Tick label template used by the datetime formatter at its 'days' scale.
p5.xaxis[0].formatter.days = ['Hour %H']
p5.x_range.range_padding = 0
p5.ygrid.grid_line_color = None
# --- Plot 6: heat map of sign-ups per country and year -----------------------
# Number of users per (country, year) pair.
temp = df.groupby(['country', 'year'])['user_id'].count().reset_index()

# Wide table: one row per country, one column per year, 0 where a country had
# no sign-ups in that year.
temp2 = temp.pivot_table(values='user_id', columns='year', index='country').fillna(0)
# Keep only countries with more than 1000 sign-ups over all years.
temp2 = temp2[temp2.sum(axis=1) > 1000]
temp2.columns.name = 'Year'

# Back to long form: one row per (country, Year) with the sign-up count in
# the 'rate' column.
dft = pd.DataFrame(temp2.stack(), columns=['rate']).reset_index()

source = ColumnDataSource(dft)
colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
# Colours are spread linearly between the smallest and largest count.
mapper = LinearColorMapper(palette=colors, low=dft.rate.min(), high=dft.rate.max())

p6 = figure(plot_width=600, plot_height=350,
            title="Highest Sign-Up Rate is seen in the US and Russia towards end of study period",
            x_range=list(temp2.index), y_range=list(reversed(temp2.columns)),
            toolbar_location=None, tools="", x_axis_location="above")
# One unit-sized cell per (country, Year), coloured by its count.
p6.rect(x="country", y="Year", width=1, height=1, source=source,
        line_color=None, fill_color=transform('rate', mapper))

# The mapped values are absolute sign-up counts, so the colour-bar ticks are
# printed as plain integers rather than with a percent sign.
color_bar = ColorBar(color_mapper=mapper, location=(0, 0),
                     ticker=BasicTicker(desired_num_ticks=len(colors)),
                     formatter=PrintfTickFormatter(format="%d"))
p6.add_layout(color_bar, 'right')

p6.axis.axis_line_color = None
p6.axis.major_tick_line_color = None
p6.axis.major_label_standoff = 0
# Rotate the country names (angle in radians) so they do not overlap.
p6.xaxis.major_label_orientation = 1.0
# --- Data snippet: the last ten records rendered as a table ------------------
snippet_fields = ['user_id', 'country', 'age', 'gender', 'playcount', 'date']
table_source = ColumnDataSource(df[snippet_fields].tail(10))

# (field, heading) pairs for the columns that need no special formatting.
plain_columns = [
    ("user_id", "User ID"),
    ("country", "Country"),
    ("age", "Age"),
    ("gender", "Gender"),
    ("playcount", "Play Count"),
]
table_columns = [
    TableColumn(field=field, title=heading, width=20)
    for field, heading in plain_columns
]
# The registration timestamp gets a wider column and an explicit date format.
table_columns.append(
    TableColumn(field="date", title="Registration", width=40,
                formatter=DateFormatter(format="%m/%d/%Y %H:%M:%S"))
)

data_table = DataTable(source=table_source, columns=table_columns, height=350)
# --- Page headings and final layout ------------------------------------------
# Both headings share the same CSS styling; each Div receives its own copy.
heading_style = {"color": "grey", "font-family": "cambria"}

title_html = (
    "<h1><center>An Overview of Spotify Users Data</center></h1><br/>\n"
    "<h3><center>This dashboard gives a bird-eye view of the users who registered for Spotify between 2002 and 2012</center></h3>"
)
div = Div(text=title_html, width=1500, height=150, style=dict(heading_style))

snapshot_html = "<h3><center>Below is a snapshot of the Data</center></h3>"
div2 = Div(text=snapshot_html, width=1500, height=50, style=dict(heading_style))

# Rows of the page from top to bottom: title, two rows of three charts, the
# table heading and the table itself.
rows = [
    [div],
    [p1, p5, p6],
    [p3, p4, p2],
    [div2],
    [data_table],
]
dashboard = grid(rows)
show(dashboard)