import glob
import html
import json
import os
import re
from collections import namedtuple

import datasette
import markupsafe
from jinja2 import FunctionLoader

from . import config  # startup() below refers to config.enabled_databases()
from .config import enabled_databases, ensure_schema
from .hookspecs import hookimpl
from .plugin import pm
from .routes import get_routes
from .utils import module_from_path
from .workers import start_workers

ConfigSchema = namedtuple(
    'ConfigSchema',
    ['schema', 'uischema', 'key', 'group', 'sort'],
    defaults=(None, 1000),
)
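
# Plugins registered with `pm` (see startup() and extra_template_vars() below)
# describe their crawl settings by exposing a module-level config_schema()
# function that returns a ConfigSchema, plus an optional config_default_value()
# function. A minimal illustrative sketch; the `max_depth` key and its schema
# are invented for this example:
#
#     def config_schema():
#         return ConfigSchema(
#             schema={'type': 'integer', 'minimum': 0},
#             uischema={'type': 'Control', 'scope': '#/properties/max_depth'},
#             key='max_depth',
#             group='Links',
#         )
#
#     def config_default_value():
#         return 1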

class JsonString:
    """Wrap an object so templates render it as literal JSON (via __html__)."""
    def __init__(self, obj):
        self.obj = obj

    def __html__(self):
        return json.dumps(self.obj)
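
# Startup hook: make sure the dss_* schema exists in every enabled database,
# register any *.py files from Datasette's plugins directory with this plugin's
# own plugin manager (pm), and start the background crawl workers.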
@datasette.hookimpl
def startup(datasette):
    async def inner():
        enabled_databases = config.enabled_databases(datasette)
        for db_name in enabled_databases:
            await ensure_schema(datasette.databases[db_name])

        if enabled_databases:
            # TODO: this isn't a documented surface area, so it's a bit skeezy to be using it
            if datasette.plugins_dir:
                for filepath in glob.glob(os.path.join(datasette.plugins_dir, "*.py")):
                    if not os.path.isfile(filepath):
                        continue

                    mod = module_from_path(filepath, name=os.path.basename(filepath))
                    try:
                        pm.register(mod)
                    except ValueError:
                        # Plugin already registered
                        pass

            start_workers(datasette)

    return inner
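
# Jinja's FunctionLoader can't enumerate its templates (list_templates() raises
# TypeError by default); returning an empty list instead keeps template listing
# on the combined loader working.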
class MyFunctionLoader(FunctionLoader):
    def list_templates(self):
        return []
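
# Template loader hook: Datasette requests table/row templates whose names embed
# the database name (e.g. table-<db>-dss_crawl.html); map those back to the
# generic templates bundled in this package's templates/ directory.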
@datasette.hookimpl
def prepare_jinja2_environment(env, datasette):
    def load_func(path):
        try:
            # Collapse per-database template names to the generic bundled templates.
            for generic in (
                'table-dss_crawl.html',
                'table-dss_job.html',
                'table-dss_host_rate_limit.html',
                'table-dss_job_stats.html',
                'table-dss_extract_stats.html',
                'table-dss_zstd_dict.html',
                'row-dss_crawl.html',
                'row-dss_host_rate_limit.html',
            ):
                prefix, suffix = generic.split('-', 1)
                if path.startswith(prefix + '-') and path.endswith('-' + suffix):
                    path = generic
                    break

            template_path = os.path.abspath(os.path.join(__file__, '..', 'templates', path))
            with open(template_path, 'r') as f:
                contents = f.read()

            # TODO: this should return True in prod, I think? Although maybe
            # just a performance improvement, so it's fine.
            return contents, path, lambda: False
        except FileNotFoundError:
            return None

    env.loader.loaders.insert(0, MyFunctionLoader(load_func))
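
# Metadata hook: for every database where this plugin is enabled, hide the dss_*
# bookkeeping tables and give them sensible default sort orders.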
@datasette.hookimpl
def get_metadata(datasette, key, database, table):
    rv = {
        'databases': {}
    }

    # There's a weird circular dependency issue here.
    # When we call enabled_databases, it checks the plugin configuration to see
    # if it's enabled.
    #
    # Which means it's not safe for _us_ to call enabled_databases until _after_
    # that configuration has been initialized.
    for db_name in enabled_databases(datasette, empty_if_not_initialized=True):
        rv['databases'][db_name] = {
            'tables': {
                'dss_crawl_queue': {
                    'sort_desc': 'id',
                    'hidden': True,
                },
                'dss_crawl_queue_history': {
                    'sort_desc': 'processed_at',
                    'hidden': True,
                },
                'dss_extract_stats': {
                    'hidden': True,
                },
                'dss_fetch_cache': {
                    'sort_desc': 'fetched_at',
                    'hidden': True,
                },
                'dss_host_rate_limit': {
                    'hidden': True,
                },
                'dss_job': {
                    'sort_desc': 'id',
                    'hidden': True,
                },
                'dss_job_stats': {
                    'sort_desc': 'job_id',
                    'hidden': True,
                },
                'dss_ops': {
                    'hidden': True,
                },
                'dss_zstd_dict': {
                    'hidden': True,
                },
            }
        }

    return rv
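
# Template-vars hook: build the crawl-configuration form. Each plugin's
# config_schema() contributes a JSON Schema fragment plus a JSON Forms-style UI
# schema element, grouped into categories; when the request is for an existing
# /<db>/dss_crawl/<id> page, that crawl's saved name and config become the
# form's default values.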
@datasette.hookimpl
def extra_template_vars(datasette, request):
    """Add dss_schema, dss_uischema, dss_default_config, dss_db and dss_id variables."""
    # TODO: this is pretty janky! It would be nice to add these only
    # on our templates.
    known_groups = {
        'Seeds': 1,
        'Links': 2,
        'Caching': 5,
        'Extracting': 6,
        'Other': 7,
    }
    groups = {}

    async def extra_vars():
        schema = {
            'type': 'object',
            'properties': {
            }
        }
        schema['properties']['name'] = {
            'type': 'string',
            'minLength': 1
        }

        default_config = {'name': ''}
        for plugin in pm.get_plugins():
            if 'config_schema' not in dir(plugin):
                continue

            rv = plugin.config_schema()
            schema['properties'][rv.key] = rv.schema

            group = rv.group
            if group not in known_groups:
                group = 'Other'
            groups[group] = groups.get(group, [])
            groups[group].append((rv.uischema, rv.sort))

            if 'config_default_value' in dir(plugin):
                default_config[rv.key] = plugin.config_default_value()

        id = 0
        default_config['name'] = 'Crawl name'

        category_schemas = []
        for key in sorted(groups.keys(), key=lambda x: known_groups[x]):
            elements = sorted(groups[key], key=lambda x: x[1])
            elements = [x[0] for x in elements]
            category_schemas.append({
                'type': 'Category',
                'label': key,
                'elements': elements
            })

        uischema = {
            "type": "VerticalLayout",
            "elements": [
                {
                    "type": "Control",
                    "scope": "#/properties/name",
                },
                {
                    "type": "Categorization",
                    "elements": category_schemas,
                }
            ]
        }

        dss_db = None
        m = re.search('^/([^/]+)/', request.path)
        if m:
            dss_db = m.group(1)

        m = re.search('/([^/]+)/dss_crawl/([0-9]+)$', request.path)
        if m:
            db_name = m.group(1)
            db = datasette.databases[db_name]
            id = int(m.group(2))
            rv = await db.execute('SELECT name, config FROM dss_crawl WHERE id = ?', [id])
            for row in rv:
                config = json.loads(row['config'])
                config['name'] = row['name']
                default_config = config

        return {
            "dss_schema": JsonString(schema),
            "dss_uischema": JsonString(uischema),
            "dss_default_config": JsonString(default_config),
            "dss_db": dss_db,
            "dss_id": id
        }

    return extra_vars
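
# Cell-rendering hook: in dss_job_stats, link hosts and fetch counts to filtered
# views of dss_crawl_queue_history; in dss_extract_stats, link database and
# table names to their Datasette pages.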
@datasette.hookimpl
def render_cell(database, row, table, column, value):
    if table not in ('dss_job_stats', 'dss_extract_stats'):
        return None

    def link(label, href):
        return markupsafe.Markup(
            '<a href="{href}">{label}</a>'.format(
                href=markupsafe.escape(href),
                label=markupsafe.escape(label)
            )
        )

    if table == 'dss_job_stats':
        if column == 'host':
            return link(value, '/{}/dss_crawl_queue_history?job_id__exact={}&host__exact={}&_sort_desc=processed_at'.format(
                database, row['job_id']['value'], row['host']))
        elif column == 'fetched_fresh':
            return link(value, '/{}/dss_crawl_queue_history?job_id__exact={}&host__exact={}&fetched_fresh__exact=1&_sort_desc=processed_at'.format(
                database, row['job_id']['value'], row['host']))
        elif column.startswith('fetched_') and column.endswith('xx'):
            # e.g. fetched_2xx covers status codes 200-299
            start = 100 * int(column[8])
            return link(value, '/{}/dss_crawl_queue_history?job_id__exact={}&host__exact={}&status_code__gte={}&status_code__lte={}&_sort_desc=processed_at'.format(
                database, row['job_id']['value'], row['host'], start, start + 99))

    if table == 'dss_extract_stats':
        if column == 'database':
            return link(value, '/{}'.format(row['database']))
        if column == 'tbl':
            return link(value, '/{}/{}'.format(row['database'], row['tbl']))
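
# Route hook: register the URL routes built by get_routes() (see routes.py).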
@datasette.hookimpl
def register_routes(datasette):
    return get_routes(datasette)