#!/usr/bin/env python2.7
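#
# Crawl a target web server and print one CSV record per fetched resource:
#   method,status_code,content_length,vhost,url
#
# Example invocation (the target URL and cache path below are illustrative):
#   ./crawl-web -t http://127.0.0.1:80 -c /tmp/crawl.cache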
###########
# IMPORTS #
###########
import sys
import argparse
import re
import os
from multiprocessing import Pool, Lock, Value, Process, Queue, JoinableQueue, Manager
import signal
from time import sleep
import cPickle as Pickle
from lxml.html import fromstring
from urlparse import urlparse, urljoin
import modules.curl as curl
####################
# GLOBAL VARIABLES #
####################
global target
global resolve
global cache
global request_max_attempt
global target_max_attempt
global rate_limit
global method
global ignore_extensions
global ignore_status_code
global print_status_code
####################################
# PROCESS HELPER OBJECT + FUNCTION #
####################################
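# Counter is a process-safe integer shared between workers via a
# multiprocessing Value guarded by a Lock; it tracks how many candidates
# have exhausted their request attempts against the target.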
class Counter(object):
def __init__(self, initval=0):
self.val = Value('i', initval)
self.lock = Lock()
def increment(self):
with self.lock:
self.val.value += 1
def value(self):
with self.lock:
return self.val.value
def run(candidate_queue, visited_urls, target_attempt):
    """Worker loop: pull candidates off the queue, crawl them, and feed any
    newly discovered links back onto the queue. A None entry (poison pill)
    tells the worker to shut down."""
    # Ignore CTRL+C in the worker process; the parent handles interrupts
    signal.signal(signal.SIGINT, signal.SIG_IGN)
while True:
candidate = candidate_queue.get()
if candidate is None:
# Poison pill means shutdown
candidate_queue.task_done()
break
candidates = crawl(candidate, visited_urls, target_attempt)
for new in candidates:
candidate_queue.put(new)
candidate_queue.task_done()
#############
# FUNCTIONS #
#############
def crawl(candidate, visited_urls, target_attempt):
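    """Request a candidate URL, extract in-scope links from the response, and
    return them as a set of new candidates. Retries up to request_max_attempt
    times; repeated failures also count towards the target-wide limit."""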
# candidate format = http[s]://host:port/resource
candidates = set()
attempt = 0
while attempt < request_max_attempt and target_attempt.value() < target_max_attempt:
if rate_limit > 0:
sleep(rate_limit)
try:
# Explode target into usable components
url, scheme, host, _, port, resource, _ = curl.explode_target(candidate, None)
# Generate identifier for candidate
if resolve is not None and host.lower() == resolve.split(':')[0].lower():
ip_address = resolve.split(':')[1]
vid = "%s,%s://%s:%s%s" % (host, scheme, ip_address, port, resource)
else:
vid = ',%s' % (url)
# Check that we haven't performed this request before
if vid in visited_urls:
break
# Add to visited urls cache
visited_urls[vid] = 1
# Perform request
response = curl.request(method=method, url=url, resolve=resolve)
            # Process the response unless its status code is in the ignore list
            ignored = any(response.status_code == int(code.strip()) for code in ignore_status_code.split(','))
            if not ignored:
                # Extract links
                links = extract_links(url, response)
                # Remove links deemed out of scope - scheme, host, and port must match
                links = purge_out_of_scope(links, scheme, host, port)
                # Add new targets as candidates
                for link in links:
                    candidates.add(link)
                # Print, unless filtered by --print-status-code
                if print_status_code is None or any(response.status_code == int(code.strip()) for code in print_status_code.split(',')):
                    sys.stdout.write('%s,%s,%s,%s\n' % (method, response.status_code, response.content_length, vid))
                    sys.stdout.flush()
        except:
            #import traceback
            #traceback.print_exc()
            attempt += 1
            if attempt == request_max_attempt:
                sys.stderr.write("%s => Maximum error count reached for candidate, skipping.\n" % (candidate))
                target_attempt.increment()
                if target_attempt.value() >= target_max_attempt:
                    sys.stderr.write("%s => Maximum error count reached for target, skipping.\n" % (target))
            continue
        # Request succeeded; break out of the retry loop
        break
return candidates
def extract_links(parent_url, response):
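    """Collect candidate URLs from a response: the Location header, common
    link-bearing HTML attributes (href/src/action), meta refresh directives,
    inline scripts, and HTML comments. Relative paths are resolved against
    parent_url before being returned."""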
unsanitised = set()
# Extract from location header
if 'location' in response.headers:
unsanitised.add(response.headers['location'])
# No point in carrying on if there is no body to parse
if response.text and response.text.strip() != '':
        # Only proceed if the response contains an HTML document
if 'content-type' in response.headers and 'text/html' in response.headers['content-type'].lower():
doc = fromstring(response.text)
# Extracts paths from "a" HTML elements.
for a in doc.xpath("//a"):
unsanitised.add(a.get("href"))
# Extracts paths from "area" HTML elements.
for area in doc.xpath("//area"):
unsanitised.add(area.get("href"))
# Extracts paths from "form" HTML elements.
for form in doc.xpath("//form"):
unsanitised.add(form.get("action"))
# Extracts paths from "frame" HTML elements.
for frame in doc.xpath("//frame"):
unsanitised.add(frame.get("src"))
# Extracts paths from "iframe" HTML elements.
for iframe in doc.xpath("//iframe"):
unsanitised.add(iframe.get("src"))
# Extracts paths from "link" HTML elements.
            for link in doc.xpath("//link"):
                unsanitised.add(link.get("href"))
# Extracts meta refresh URLs.
for r in doc.xpath('//html/head/meta[re:test(@http-equiv, "^refresh", "i")]',
namespaces={"re": "http://exslt.org/regular-expressions"}):
                content = r.get("content")
                # Split on "url=" case-insensitively while preserving the URL's case
                parts = re.split(r'url=', content, maxsplit=1, flags=re.IGNORECASE)
                if len(parts) == 2:
                    unsanitised.add(parts[1].strip().replace('\'', '').replace('"', ''))
# Extracts paths from "data-url" attributes.
for match in re.finditer(r'''data-url\s*=\s*['"]?(.*?)?['"]?[\s>]''', response.text):
unsanitised.add(match.group(1))
# Extracts paths within "script" HTML elements.
for script in doc.xpath("//script"):
if script.text is None:
continue
for match in re.finditer(r'''([\/a-zA-Z0-9%._+-]+)''', script.text):
potential = match.group(0)
if '.' in potential and '/' in potential and '*' not in potential \
and not potential.startswith('//') and potential.startswith('/'):
unsanitised.add(potential)
# Extracts paths within "comments".
for comment in doc.xpath("//comment()"):
if comment.text is None:
continue
for match in re.finditer(r'''(^|\s)(\/[\/a-zA-Z0-9%._+-]+)''', comment.text):
potential = match.group(0).strip()
if potential.startswith('/'):
unsanitised.add(potential)
# Extracts images.
for img in doc.xpath("//img"):
unsanitised.add(img.get("src"))
# Extracts objects.
for obj in doc.xpath("//object/embed"):
unsanitised.add(obj.get("src"))
# Extracts scripts.
for script in doc.xpath("//script"):
unsanitised.add(script.get("src"))
        # Only proceed if the response contains a JavaScript document
#if 'content-type' in response.headers and 'application/javascript' in response.headers['content-type'].lower():
# for match in re.finditer(r'''([\/a-zA-Z0-9%._+-]+)''', response.text):
# potential = match.group(0)
# if '.' in potential and '/' in potential and '*' not in potential \
# and not potential.startswith('//') and potential.startswith('/'):
# unsanitised.add(potential)
sanitised = set()
for href in unsanitised:
if href is None \
or href.strip() == '' \
or href.lower().startswith('mailto:') \
or href.lower().startswith('javascript:') \
or href.lower().startswith('#') \
or href.startswith('..') \
or href.startswith('/..'):
continue
if href.startswith('http://') or href.startswith('https://'):
u_parse = urlparse(href, allow_fragments=False)
sanitised.add(u_parse.geturl())
else:
sanitised.add(urljoin(parent_url, href, allow_fragments=False))
return sanitised
def purge_out_of_scope(links, scheme, host, port):
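    """Drop links whose scheme, host, or port differ from the source target,
    along with links whose extension appears in the ignore list."""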
tmp = list()
for link in links:
l_url, l_scheme, l_host, _, l_port, _, _ = curl.explode_target(link, None)
# scheme, host, and port need to match the source target
if scheme != l_scheme or host != l_host or port != l_port:
continue
# do not crawl a page with a blacklisted extension
if any((l_url.endswith(ext.strip()) for ext in ignore_extensions.split(','))):
continue
tmp.append(link)
return tmp
def load_cache():
    if cache is None or not os.path.exists(cache):
        # The cache doesn't exist, return an empty set
        return set()
    with open(cache, 'rb') as cache_file:
        return Pickle.load(cache_file)
def save_cache(urls):
    if cache is None:
        return
    cache_dir = os.path.dirname(cache)
    if cache_dir and not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    with open(cache, 'wb') as cache_file:
        # Write the result to the file as a pickled object
        # Use the binary protocol for better performance
        Pickle.dump(urls, cache_file, protocol=1)
########
# MAIN #
########
if __name__ == '__main__':
desc = 'Identify resources hosted by the target web server by crawling linked resources.'
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('-v', '--vhost',
action='store',
help='virtual host',
metavar='HOST',
default=None)
parser.add_argument('-c', '--cache',
action='store',
help='cache file to store visited URLs between invocations',
metavar='FILE',
default=None)
parser.add_argument('--ignore-extensions',
action='store',
                        help='files with extensions that should not be requested (default: .exe,.zip,.tar,.bz2,.run,.asc,.gz,.bin,.iso,.dmg,.xz,.pdf,.docx,.doc,.pptx,.ppt)',
metavar='EXT',
default='.exe,.zip,.tar,.bz2,.run,.asc,.gz,.bin,.iso,.dmg,.xz,.pdf,.docx,.doc,.pptx,.ppt')
parser.add_argument('--ignore-status-code',
action='store',
help='do not crawl responses that return a matching status code (default: 404)',
metavar='CODE',
default='404')
    parser.add_argument('-p', '--print-status-code',
action='store',
help='only print responses with a matching status code (default: all)',
metavar='CODE',
default=None)
parser.add_argument('-m', '--method',
action='store',
help='HTTP method to use for requests (default: GET)',
metavar='METHOD',
default='GET')
    parser.add_argument('--rate-limit',
                        action='store',
                        type=float,
                        help='wait NUM seconds between each request (default: 0)',
                        metavar='NUM',
                        default=0)
    parser.add_argument('--request-max-attempt',
                        action='store',
                        type=int,
                        help='skip request after NUM failed connection attempts (default: 3)',
                        metavar='NUM',
                        default=3)
    parser.add_argument('--target-max-attempt',
                        action='store',
                        type=int,
                        help='stop crawl after NUM failed request max attempts (default: 2)',
                        metavar='NUM',
                        default=2)
#('ignore_regex', None, False, 'skip resources that match regex'),
#('allow_regex', None, False, 'crawl only resources that match regex'),
#('crawl_depth', '5', True, 'skip resources that are NUM directories deep, -1 for unlimited'),
#('crawl_limit', '-1', True, 'stop crawl after NUM resources have been retrieved, -1 for unlimited'),
required = parser.add_argument_group('required arguments')
required.add_argument('-t', '--target',
action='store',
help='target web server URL (http[s]://address:port)',
metavar='URL',
required=True)
args = parser.parse_args()
global target
target = args.target
global cache
cache = args.cache
global request_max_attempt
request_max_attempt = args.request_max_attempt
global target_max_attempt
target_max_attempt = args.target_max_attempt
global rate_limit
rate_limit = args.rate_limit
global method
method = args.method
global ignore_extensions
ignore_extensions = args.ignore_extensions
global ignore_status_code
ignore_status_code = args.ignore_status_code
global print_status_code
print_status_code = args.print_status_code
# Explode target into useful components
global resolve
url, _, _, _, _, _, resolve = curl.explode_target(args.target, args.vhost)
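    # A manager-backed dict acts as the shared "visited" set across worker
    # processes; only key membership matters, values are unused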
manager = Manager()
visited_urls = manager.dict()
    urls = load_cache()
    for cached_url in urls:
        visited_urls[cached_url] = 1
target_attempt = Counter(0)
candidate_queue = JoinableQueue()
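    # Spawn a fixed pool of crawler processes that consume candidates from
    # the joinable queue and push newly discovered links back onto it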
num_processes = 10
processes = list()
for _ in range(num_processes):
p = Process(target=run, args=(candidate_queue,visited_urls,target_attempt,))
processes.append(p)
p.start()
# Enqueue first job
candidate_queue.put(url)
try:
# Wait for all of the tasks to finish
candidate_queue.join()
# Send poison pill to each worker process
for i in xrange(num_processes):
candidate_queue.put(None)
# Wait for poison pills to take effect
candidate_queue.join()
# Save visited urls to disk cache
save_cache(visited_urls.keys())
except KeyboardInterrupt:
for p in processes:
p.terminate()