#!/usr/bin/env python2.7
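#
# Crawl a target web server and print one CSV record per fetched resource:
#   method,status_code,content_length,vhost,url
#
# Example invocation (the target URL and cache path below are illustrative):
#   ./crawl-web -t http://127.0.0.1:80 -c /tmp/crawl.cache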
###########
# IMPORTS #
###########
import sys
import argparse
import re
import os
from multiprocessing import Pool, Lock, Value, Process, Queue, JoinableQueue, Manager
import signal
from time import sleep
import cPickle as Pickle
from lxml.html import fromstring
from urlparse import urlparse, urljoin
import modules.curl as curl
####################
# GLOBAL VARIABLES #
####################
global target
global resolve
global cache
global request_max_attempt
global target_max_attempt
global rate_limit
global method
global ignore_extensions
global ignore_status_code
global print_status_code
####################################
# PROCESS HELPER OBJECT + FUNCTION #
####################################
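# Counter is a process-safe integer shared between workers via a
# multiprocessing Value guarded by a Lock; it tracks how many candidates
# have exhausted their request attempts against the target.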
class Counter(object):
def __init__(self, initval=0):
self.val = Value('i', initval)
self.lock = Lock()
def increment(self):
with self.lock:
self.val.value += 1
def value(self):
with self.lock:
return self.val.value
def run(candidate_queue, visited_urls, target_attempt):
    """Worker loop: pull candidates off the queue, crawl them, and feed any
    newly discovered links back onto the queue. A None entry (poison pill)
    tells the worker to shut down."""
    # Ignore CTRL+C in the worker process; the parent handles interrupts
    signal.signal(signal.SIGINT, signal.SIG_IGN)
while True:
candidate = candidate_queue.get()
if candidate is None:
# Poison pill means shutdown
candidate_queue.task_done()
break
candidates = crawl(candidate, visited_urls, target_attempt)
for new in candidates:
candidate_queue.put(new)
candidate_queue.task_done()
#############
# FUNCTIONS #
#############
def crawl(candidate, visited_urls, target_attempt):
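    """Request a candidate URL, extract in-scope links from the response, and
    return them as a set of new candidates. Retries up to request_max_attempt
    times; repeated failures also count towards the target-wide limit."""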
# candidate format = http[s]://host:port/resource
candidates = set()
attempt = 0
while attempt < request_max_attempt and target_attempt.value() < target_max_attempt:
if rate_limit > 0:
sleep(rate_limit)
try:
# Explode target into usable components
url, scheme, host, _, port, resource, _ = curl.explode_target(candidate, None)
# Generate identifier for candidate
if resolve is not None and host.lower() == resolve.split(':')[0].lower():
ip_address = resolve.split(':')[1]
vid = "%s,%s://%s:%s%s" % (host, scheme, ip_address, port, resource)
else:
vid = ',%s' % (url)
# Check that we haven't performed this request before
if vid in visited_urls:
break
# Add to visited urls cache
visited_urls[vid] = 1
# Perform request
response = curl.request(method=method, url=url, resolve=resolve)
            # Process the response unless its status code is in the ignore list
            ignored = any(response.status_code == int(code.strip()) for code in ignore_status_code.split(','))
            if not ignored:
                # Extract links
                links = extract_links(url, response)
                # Remove links deemed out of scope - scheme, host, and port must match
                links = purge_out_of_scope(links, scheme, host, port)
                # Add new targets as candidates
                for link in links:
                    candidates.add(link)
                # Print, unless filtered by --print-status-code
                if print_status_code is None or any(response.status_code == int(code.strip()) for code in print_status_code.split(',')):
                    sys.stdout.write('%s,%s,%s,%s\n' % (method, response.status_code, response.content_length, vid))
                    sys.stdout.flush()
        except:
            #import traceback
            #traceback.print_exc()
            attempt += 1
            if attempt == request_max_attempt:
                sys.stderr.write("%s => Maximum error count reached for candidate, skipping.\n" % (candidate))
                target_attempt.increment()
                if target_attempt.value() >= target_max_attempt:
                    sys.stderr.write("%s => Maximum error count reached for target, skipping.\n" % (target))
            continue
        # Request succeeded; break out of the retry loop
        break
return candidates
def extract_links(parent_url, response):
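    """Collect candidate URLs from a response: the Location header, common
    link-bearing HTML attributes (href/src/action), meta refresh directives,
    inline scripts, and HTML comments. Relative paths are resolved against
    parent_url before being returned."""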
unsanitised = set()
# Extract from location header
if 'location' in response.headers:
unsanitised.add(response.headers['location'])
# No point in carrying on if there is no body to parse
if response.text and response.text.strip() != '':
        # Only proceed if the response contains an HTML document
if 'content-type' in response.headers and 'text/html' in response.headers['content-type'].lower():
doc = fromstring(response.text)
# Extracts paths from "a" HTML elements.
for a in doc.xpath("//a"):
unsanitised.add(a.get("href"))
# Extracts paths from "area" HTML elements.
for area in doc.xpath("//area"):
unsanitised.add(area.get("href"))
# Extracts paths from "form" HTML elements.
for form in doc.xpath("//form"):
unsanitised.add(form.get("action"))
# Extracts paths from "frame" HTML elements.
for frame in doc.xpath("//frame"):
unsanitised.add(frame.get("src"))
# Extracts paths from "iframe" HTML elements.
for iframe in doc.xpath("//iframe"):
unsanitised.add(iframe.get("src"))
# Extracts paths from "link" HTML elements.
            for link in doc.xpath("//link"):
                unsanitised.add(link.get("href"))
# Extracts meta refresh URLs.
for r in doc.xpath('//html/head/meta[re:test(@http-equiv, "^refresh", "i")]',
namespaces={"re": "http://exslt.org/regular-expressions"}):
                content = r.get("content")
                # Split on "url=" case-insensitively while preserving the URL's case
                parts = re.split(r'url=', content, maxsplit=1, flags=re.IGNORECASE)
                if len(parts) == 2:
                    unsanitised.add(parts[1].strip().replace('\'', '').replace('"', ''))
# Extracts paths from "data-url" attributes.
for match in re.finditer(r'''data-url\s*=\s*['"]?(.*?)?['"]?[\s>]''', response.text):
unsanitised.add(match.group(1))
# Extracts paths within "script" HTML elements.
for script in doc.xpath("//script"):
if script.text is None:
continue
for match in re.finditer(r'''([\/a-zA-Z0-9%._+-]+)''', script.text):
potential = match.group(0)
if '.' in potential and '/' in potential and '*' not in potential \
and not potential.startswith('//') and potential.startswith('/'):
unsanitised.add(potential)
# Extracts paths within "comments".
for comment in doc.xpath("//comment()"):
if comment.text is None:
continue
for match in re.finditer(r'''(^|\s)(\/[\/a-zA-Z0-9%._+-]+)''', comment.text):
potential = match.group(0).strip()
if potential.startswith('/'):
unsanitised.add(potential)
# Extracts images.
for img in doc.xpath("//img"):
unsanitised.add(img.get("src"))
# Extracts objects.
for obj in doc.xpath("//object/embed"):
unsanitised.add(obj.get("src"))
# Extracts scripts.
for script in doc.xpath("//script"):
unsanitised.add(script.get("src"))
        # Only proceed if the response contains a JavaScript document
#if 'content-type' in response.headers and 'application/javascript' in response.headers['content-type'].lower():
# for match in re.finditer(r'''([\/a-zA-Z0-9%._+-]+)''', response.text):
# potential = match.group(0)
# if '.' in potential and '/' in potential and '*' not in potential \
# and not potential.startswith('//') and potential.startswith('/'):
# unsanitised.add(potential)
sanitised = set()
for href in unsanitised:
if href is None \
or href.strip() == '' \
or href.lower().startswith('mailto:') \
or href.lower().startswith('javascript:') \
or href.lower().startswith('#') \
or href.startswith('..') \
or href.startswith('/..'):
continue
if href.startswith('http://') or href.startswith('https://'):
u_parse = urlparse(href, allow_fragments=False)
sanitised.add(u_parse.geturl())
else:
sanitised.add(urljoin(parent_url, href, allow_fragments=False))
return sanitised
def purge_out_of_scope(links, scheme, host, port):
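    """Drop links whose scheme, host, or port differ from the source target,
    along with links whose extension appears in the ignore list."""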
tmp = list()
for link in links:
l_url, l_scheme, l_host, _, l_port, _, _ = curl.explode_target(link, None)
# scheme, host, and port need to match the source target
if scheme != l_scheme or host != l_host or port != l_port:
continue
# do not crawl a page with a blacklisted extension
if any((l_url.endswith(ext.strip()) for ext in ignore_extensions.split(','))):
continue
tmp.append(link)
return tmp
def load_cache():
    if cache is None or not os.path.exists(cache):
        # The cache doesn't exist, return an empty set
        return set()
    with open(cache, 'rb') as cache_file:
        return Pickle.load(cache_file)
def save_cache(urls):
    if cache is None:
        return
    cache_dir = os.path.dirname(cache)
    if cache_dir and not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    with open(cache, 'wb') as cache_file:
        # Write the result to the file as a pickled object
        # Use the binary protocol for better performance
        Pickle.dump(urls, cache_file, protocol=1)
########
# MAIN #
########
if __name__ == '__main__':
desc = 'Identify resources hosted by the target web server by crawling linked resources.'
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('-v', '--vhost',
action='store',
help='virtual host',
metavar='HOST',
default=None)
parser.add_argument('-c', '--cache',
action='store',
help='cache file to store visited URLs between invocations',
metavar='FILE',
default=None)
parser.add_argument('--ignore-extensions',
action='store',
                        help='files with extensions that should not be requested (default: .exe,.zip,.tar,.bz2,.run,.asc,.gz,.bin,.iso,.dmg,.xz,.pdf,.docx,.doc,.pptx,.ppt)',
metavar='EXT',
default='.exe,.zip,.tar,.bz2,.run,.asc,.gz,.bin,.iso,.dmg,.xz,.pdf,.docx,.doc,.pptx,.ppt')
parser.add_argument('--ignore-status-code',
action='store',
help='do not crawl responses that return a matching status code (default: 404)',
metavar='CODE',
default='404')
    parser.add_argument('-p', '--print-status-code',
action='store',
help='only print responses with a matching status code (default: all)',
metavar='CODE',
default=None)
parser.add_argument('-m', '--method',
action='store',
help='HTTP method to use for requests (default: GET)',
metavar='METHOD',
default='GET')
    parser.add_argument('--rate-limit',
                        action='store',
                        type=float,
                        help='wait NUM seconds between each request (default: 0)',
                        metavar='NUM',
                        default=0)
    parser.add_argument('--request-max-attempt',
                        action='store',
                        type=int,
                        help='skip request after NUM failed connection attempts (default: 3)',
                        metavar='NUM',
                        default=3)
    parser.add_argument('--target-max-attempt',
                        action='store',
                        type=int,
                        help='stop crawl after NUM failed request max attempts (default: 2)',
                        metavar='NUM',
                        default=2)
#('ignore_regex', None, False, 'skip resources that match regex'),
#('allow_regex', None, False, 'crawl only resources that match regex'),
#('crawl_depth', '5', True, 'skip resources that are NUM directories deep, -1 for unlimited'),
#('crawl_limit', '-1', True, 'stop crawl after NUM resources have been retrieved, -1 for unlimited'),
required = parser.add_argument_group('required arguments')
required.add_argument('-t', '--target',
action='store',
help='target web server URL (http[s]://address:port)',
metavar='URL',
required=True)
args = parser.parse_args()
global target
target = args.target
global cache
cache = args.cache
global request_max_attempt
request_max_attempt = args.request_max_attempt
global target_max_attempt
target_max_attempt = args.target_max_attempt
global rate_limit
rate_limit = args.rate_limit
global method
method = args.method
global ignore_extensions
ignore_extensions = args.ignore_extensions
global ignore_status_code
ignore_status_code = args.ignore_status_code
global print_status_code
print_status_code = args.print_status_code
# Explode target into useful components
global resolve
url, _, _, _, _, _, resolve = curl.explode_target(args.target, args.vhost)
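    # A manager-backed dict acts as the shared "visited" set across worker
    # processes; only key membership matters, values are unused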
manager = Manager()
visited_urls = manager.dict()
    urls = load_cache()
    for cached_url in urls:
        visited_urls[cached_url] = 1
target_attempt = Counter(0)
candidate_queue = JoinableQueue()
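    # Spawn a fixed pool of crawler processes that consume candidates from
    # the joinable queue and push newly discovered links back onto it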
num_processes = 10
processes = list()
for _ in range(num_processes):
p = Process(target=run, args=(candidate_queue,visited_urls,target_attempt,))
processes.append(p)
p.start()
# Enqueue first job
candidate_queue.put(url)
try:
# Wait for all of the tasks to finish
candidate_queue.join()
# Send poison pill to each worker process
for i in xrange(num_processes):
candidate_queue.put(None)
# Wait for poison pills to take effect
candidate_queue.join()
# Save visited urls to disk cache
save_cache(visited_urls.keys())
except KeyboardInterrupt:
for p in processes:
p.terminate()