This repository was archived by the owner on May 21, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelpers.py
118 lines (105 loc) · 4.46 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import requests
import nltk
import re
class ESInterface(object):
def __init__(self, url, username, password):
""" Initializes object, creates ES connection """
# Create the ES connection
self.conn = Elasticsearch(url, verify_certs=True, http_auth=(username, password))
self.index_name = 'digitalrecords-search'
self.doc_type = 'modelresult'
# Query to get records that have keywords
self.query = {
'query': {
'exists': {
'field': 'keywords'
}
},
'_source': ['id', 'keywords'],
'size': 1000
}
# Get the total number of matching records
self.get_total()
def get_total(self):
""" Gets the total number of records that match the query """
query = self.query
query['size'] = 1
search = self.conn.search(body=query, index=self.index_name)
self.total = search['hits']['total']
def get_records(self, test_run=False):
print('Total matches: %s' % self.total)
results = []
records = scan(self.conn, index=self.index_name, query=self.query, doc_type=self.doc_type)
if test_run:
records_num = 50000
print('Total records to fetch: %s (TEST RUN)' % records_num)
else:
records_num = self.total
print('Total records to fetch: %s' % records_num)
print('\nNow fetching records from ES...')
for counter, record in enumerate(records, 1):
if counter > records_num:
break
else:
results += [record['_source']]
if counter % 10000 == 0:
print(' - Fetched %s records out of %s' % (counter, records_num))
print('Finished fetching records from ES\n')
return results
class SearchInterface(object):
def __init__(self):
self.base_url = 'https://digitalrecords.showthething.com/api/search/v0/records/'
self.get_total()
def get_total(self):
""" Gets the total number of records that match the query """
resp = self._fetch_batch(0, limit=1)
data = resp.json()
self.total = data['hits']['count']
def _fetch_batch(self, offset, limit=1000):
params = {'limit': limit, 'offset': offset, 'exists': 'keywords'}
resp = requests.get(self.base_url, params=params)
return resp
def get_records(self, test_run=False):
print('Total matches: %s' % self.total)
if test_run:
records_num = 10000
print('Total records to fetch: %s (TEST RUN)' % records_num)
else:
records_num = self.total
print('Total records to fetch: %s' % records_num)
results = []
batches = range(0, self.total, 1000)
print('\nNow fetching records from ES...')
for batch_offset in batches:
fetch_count = batch_offset + 1000
if fetch_count <= records_num:
resp = self._fetch_batch(batch_offset)
if resp.status_code == 200:
data = resp.json()
results += data['hits']['results']
else:
raise Exception('Error fetching records from the server!')
print(' - Fetched %s records out of %s' % (fetch_count, records_num))
else:
break
print('Finished fetching records from ES\n')
return results
def tokenize(text):
# get the english stopwords
nltk.download('stopwords', quiet=True)
stopwords = nltk.corpus.stopwords.words('english')
# first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
nltk.download('punkt', quiet=True)
tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.wordpunct_tokenize(sent)]
filtered_tokens = []
# filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
for token in tokens:
# include only those that contains letters
if re.search('[a-zA-Z]', token):
# exclude stop words, those shorter than 3 characters, and those that
# start with non-alphanumeric characters
if token not in stopwords and len(token) > 2 and token[0].isalnum():
filtered_tokens.append(token)
return filtered_tokens