
Commit

fix tests: httpbun.org → .com (#455)
* fix tests: httpbun.org → .com

* uncomment tests

* fix test

* restore original
adbar authored Dec 13, 2023
1 parent aabfdec commit 05a73de
Showing 5 changed files with 43 additions and 43 deletions.
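Note: the test host is hard-coded throughout the files below. As a hypothetical follow-up (not part of this commit), the base URL could live in a single helper so the next domain move is a one-line change; a minimal sketch with illustrative names:

```python
# Hypothetical helper (illustrative names only, e.g. tests/conftest.py).
import os

# Central base URL for the httpbun test service; can be overridden via an
# environment variable if the domain changes again.
HTTPBUN_BASE = os.environ.get("HTTPBUN_BASE", "https://httpbun.com")

def httpbun(path: str) -> str:
    """Build a full test URL, e.g. httpbun('/status/404')."""
    return HTTPBUN_BASE + path
```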
16 changes: 8 additions & 8 deletions tests/cli_tests.py
@@ -218,7 +218,7 @@ def test_download():
#teststring = fetch_url(url)
#assert teststring is not None
#assert cli.examine(teststring, args, url) is None
-url = 'https://httpbun.org/html'
+url = 'https://httpbun.com/html'
teststring = fetch_url(url)
assert teststring is not None
assert cli.examine(teststring, args, url) is not None
@@ -408,27 +408,27 @@ def test_crawling():
args = cli.parse_args(testargs)
cli_utils.cli_crawler(args)

-testargs = ['', '--crawl', 'https://httpbun.org/html']
+testargs = ['', '--crawl', 'https://httpbun.com/html']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
-assert f.getvalue() == 'https://httpbun.org/html\n'
+assert f.getvalue() == 'https://httpbun.com/html\n'

spider.URL_STORE = UrlStore(compressed=False, strict=False)
# links permitted
-testargs = ['', '--crawl', 'https://httpbun.org/links/1/1', '--list', '--parallel', '1']
+testargs = ['', '--crawl', 'https://httpbun.com/links/1/1', '--list', '--parallel', '1']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
# possibly a bug on Github actions, should be 2 URLs
-assert f.getvalue() in ('https://httpbun.org/links/1/1\nhttps://httpbun.org/links/1/0\n', 'https://httpbun.org/links/1/1\n')
+assert f.getvalue() in ('https://httpbun.com/links/1/1\nhttps://httpbun.com/links/1/0\n', 'https://httpbun.com/links/1/1\n')
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# 0 links permitted
-args.crawl = 'https://httpbun.org/links/4/4'
+args.crawl = 'https://httpbun.com/links/4/4'
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
@@ -437,13 +437,13 @@ def test_crawling():
spider.URL_STORE = UrlStore(compressed=False, strict=False)

# Exploration (Sitemap + Crawl)
-testargs = ['', '--explore', 'https://httpbun.org/html', '--list']
+testargs = ['', '--explore', 'https://httpbun.com/html', '--list']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
-assert f.getvalue().strip() == 'https://httpbun.org/html'
+assert f.getvalue().strip() == 'https://httpbun.com/html'


def test_probing():
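For context, a minimal standalone sketch of what test_crawling drives: parse crawler arguments and capture the URLs the crawler prints. It reuses the modules the test file itself imports, and assumes trafilatura is installed and httpbun.com is reachable; it is not part of this commit.

```python
# Run the CLI crawler on a single page and capture its output,
# mirroring test_crawling above.
import io
import sys
from contextlib import redirect_stdout
from unittest.mock import patch

from trafilatura import cli, cli_utils

testargs = ['', '--crawl', 'https://httpbun.com/html']
with patch.object(sys, 'argv', testargs):
    args = cli.parse_args(testargs)

buffer = io.StringIO()
with redirect_stdout(buffer):
    cli_utils.cli_crawler(args)

print(buffer.getvalue())  # expected: 'https://httpbun.com/html\n'
```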
18 changes: 9 additions & 9 deletions tests/downloads_tests.py
@@ -53,27 +53,27 @@ def test_fetch():
assert _send_request('', True, DEFAULT_CONFIG) is None

# is_live general tests
-assert _urllib3_is_live_page('https://httpbun.org/status/301') is True
-assert _urllib3_is_live_page('https://httpbun.org/status/404') is False
-assert is_live_page('https://httpbun.org/status/403') is False
+assert _urllib3_is_live_page('https://httpbun.com/status/301') is True
+assert _urllib3_is_live_page('https://httpbun.com/status/404') is False
+assert is_live_page('https://httpbun.com/status/403') is False
# is_live pycurl tests
if pycurl is not None:
-assert _pycurl_is_live_page('https://httpbun.org/status/301') is True
+assert _pycurl_is_live_page('https://httpbun.com/status/301') is True

# fetch_url
assert fetch_url('#@1234') is None
-assert fetch_url('https://httpbun.org/status/404') is None
+assert fetch_url('https://httpbun.com/status/404') is None
# test if the functions default to no_ssl
# doesn't work?
# assert _send_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
if pycurl is not None:
assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
# no SSL, no decoding
-url = 'https://httpbun.org/status/200'
-response = _send_request('https://httpbun.org/status/200', True, DEFAULT_CONFIG)
+url = 'https://httpbun.com/status/200'
+response = _send_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
assert response.data == b''
if pycurl is not None:
-response1 = _send_pycurl_request('https://httpbun.org/status/200', True, DEFAULT_CONFIG)
+response1 = _send_pycurl_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
assert _handle_response(url, response1, False, DEFAULT_CONFIG) == _handle_response(url, response, False, DEFAULT_CONFIG)
assert _handle_response(url, response1, True, DEFAULT_CONFIG) == _handle_response(url, response, True, DEFAULT_CONFIG)
# response object
@@ -155,7 +155,7 @@ def test_queue():
testargs = ['', '-v']
with patch.object(sys, 'argv', testargs):
args = parse_args(testargs)
-inputurls = ['https://httpbun.org/status/301', 'https://httpbun.org/status/304', 'https://httpbun.org/status/200', 'https://httpbun.org/status/300', 'https://httpbun.org/status/400', 'https://httpbun.org/status/505']
+inputurls = ['https://httpbun.com/status/301', 'https://httpbun.com/status/304', 'https://httpbun.com/status/200', 'https://httpbun.com/status/300', 'https://httpbun.com/status/400', 'https://httpbun.com/status/505']
url_store = add_to_compressed_dict(inputurls)
args.archived = True
args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
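As a quick reference, a minimal sketch of the download helpers exercised above, run against the same httpbun.com endpoints (assumes trafilatura is installed and the host is reachable; not part of this commit):

```python
# Liveness probing and fetching, mirroring test_fetch above.
from trafilatura.downloads import fetch_url, is_live_page

print(is_live_page('https://httpbun.com/status/403'))     # False: a 403 page is not treated as live
print(fetch_url('https://httpbun.com/status/404'))        # None: failed download
print(fetch_url('https://httpbun.com/html') is not None)  # True: HTML body retrieved
```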
4 changes: 2 additions & 2 deletions tests/feeds_tests.py
@@ -256,7 +256,7 @@ def test_feeds_helpers():
) == ["https://example.org/rss"]
# feed discovery
assert not find_feed_urls("http://")
assert not find_feed_urls("https://httpbun.org/status/404")
assert not find_feed_urls("https://httpbun.com/status/404")
# Feedburner/Google links
assert handle_link_list(["https://feedproxy.google.com/ABCD"], params) == [
"https://feedproxy.google.com/ABCD"
@@ -271,7 +271,7 @@

def test_cli_behavior():
"""Test command-line interface with respect to feeds"""
testargs = ["", "--list", "--feed", "https://httpbun.org/xml"]
testargs = ["", "--list", "--feed", "https://httpbun.com/xml"]
with patch.object(sys, "argv", testargs):
assert main() is None

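A corresponding sketch for the feed discovery helper used in these tests (same assumptions as above):

```python
# Feed discovery, mirroring test_feeds_helpers above.
from trafilatura.feeds import find_feed_urls

print(find_feed_urls('https://httpbun.com/status/404'))  # [] — nothing to discover on an error page
```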
2 changes: 1 addition & 1 deletion tests/sitemaps_tests.py
@@ -155,7 +155,7 @@ def test_extraction():
def test_robotstxt():
'''Check if sitemaps can be found over robots.txt'''
assert not sitemaps.find_robots_sitemaps('https://http.org')
-baseurl = 'https://httpbun.org'
+baseurl = 'https://httpbun.com'
assert not sitemaps.find_robots_sitemaps(baseurl)
assert not sitemaps.extract_robots_sitemaps('# test', baseurl)
assert not sitemaps.extract_robots_sitemaps('# test'*10000, baseurl)
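Likewise, a one-line sketch of the robots.txt sitemap lookup tested above (same assumptions):

```python
# Sitemap discovery over robots.txt, mirroring test_robotstxt above.
from trafilatura import sitemaps

print(sitemaps.find_robots_sitemaps('https://httpbun.com'))  # [] — httpbun declares no sitemap
```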
46 changes: 23 additions & 23 deletions tests/spider_tests.py
@@ -27,40 +27,40 @@ def test_redirections():
"Test redirection detection."
_, _, baseurl = spider.probe_alternative_homepage('xyz')
assert baseurl is None
-_, _, baseurl = spider.probe_alternative_homepage('https://httpbun.org/redirect-to?url=https://example.org')
+_, _, baseurl = spider.probe_alternative_homepage('https://httpbun.com/redirect-to?url=https://example.org')
assert baseurl == 'https://example.org'
#_, _, baseurl = spider.probe_alternative_homepage('https://httpbin.org/redirect-to?url=https%3A%2F%2Fhttpbin.org%2Fhtml&status_code=302')


def test_meta_redirections():
"Test redirection detection using meta tag."
# empty
htmlstring, homepage = '"refresh"', 'https://httpbun.org/'
htmlstring, homepage = '"refresh"', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage
-htmlstring, homepage = '<html></html>', 'https://httpbun.org/'
+htmlstring, homepage = '<html></html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage

# unusable
-htmlstring, homepage = '<html>REDIRECT!</html>', 'https://httpbun.org/'
+htmlstring, homepage = '<html>REDIRECT!</html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage

# malformed
htmlstring, homepage = '<html><meta http-equiv="refresh" content="3600\n&lt;meta http-equiv=" content-type=""></html>', 'https://httpbun.org/'
htmlstring, homepage = '<html><meta http-equiv="refresh" content="3600\n&lt;meta http-equiv=" content-type=""></html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 == htmlstring and homepage2 == homepage

# wrong URL
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=1234"/></html>', 'https://httpbun.org/'
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=1234"/></html>', 'https://httpbun.com/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert htmlstring2 is None and homepage2 is None

# normal
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=https://httpbun.org/html"/></html>', 'http://test.org/'
htmlstring, homepage = '<html><meta http-equiv="refresh" content="0; url=https://httpbun.com/html"/></html>', 'http://test.org/'
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
-assert htmlstring2 is not None and homepage2 == 'https://httpbun.org/html'
+assert htmlstring2 is not None and homepage2 == 'https://httpbun.com/html'


def test_process_links():
@@ -103,7 +103,7 @@ def test_process_links():

def test_crawl_logic():
"Test functions related to crawling sequence and consistency."
-url = 'https://httpbun.org/html'
+url = 'https://httpbun.com/html'
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# erroneous webpage
with pytest.raises(ValueError):
@@ -118,31 +118,31 @@ def test_crawl_logic():
base_url, i, known_num, rules, is_on = spider.init_crawl(url, None, None)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
-assert todo == [] and known_links == [url,] and base_url == 'https://httpbun.org' and i == 1
+assert todo == [] and known_links == [url,] and base_url == 'https://httpbun.com' and i == 1
# delay between requests
-assert spider.URL_STORE.get_crawl_delay('https://httpbun.org') == 5
-assert spider.URL_STORE.get_crawl_delay('https://httpbun.org', default=2.0) == 2.0
+assert spider.URL_STORE.get_crawl_delay('https://httpbun.com') == 5
+assert spider.URL_STORE.get_crawl_delay('https://httpbun.com', default=2.0) == 2.0
# existing todo
spider.URL_STORE = UrlStore(compressed=False, strict=False)
base_url, i, known_num, rules, is_on = spider.init_crawl(url, [url,], None)
-assert base_url == 'https://httpbun.org' and i == 0
+assert base_url == 'https://httpbun.com' and i == 0


def test_crawl_page():
"Test page-by-page processing."
-base_url = 'https://httpbun.org'
+base_url = 'https://httpbun.com'
spider.URL_STORE = UrlStore(compressed=False, strict=False)
-spider.URL_STORE.add_urls(['https://httpbun.org/links/2/2'])
-is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.org')
+spider.URL_STORE.add_urls(['https://httpbun.com/links/2/2'])
+is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.com')
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
-assert sorted(todo) == ['https://httpbun.org/links/2/0', 'https://httpbun.org/links/2/1']
+assert sorted(todo) == ['https://httpbun.com/links/2/0', 'https://httpbun.com/links/2/1']
assert len(known_links) == 3 and visited_num == 1
# initial page
spider.URL_STORE = UrlStore(compressed=False, strict=False)
-spider.URL_STORE.add_urls(['https://httpbun.org/html'])
+spider.URL_STORE.add_urls(['https://httpbun.com/html'])
# if LANGID_FLAG is True:
-is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.org', initial=True, lang='de')
+is_on, known_num, visited_num = spider.crawl_page(0, 'https://httpbun.com', initial=True, lang='de')
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert len(todo) == 0 and len(known_links) == 1 and visited_num == 1
@@ -152,10 +152,10 @@ def test_crawl_page():
def test_focused_crawler():
"Test the whole focused crawler mechanism."
spider.URL_STORE = UrlStore()
todo, known_links = spider.focused_crawler("https://httpbun.org/links/1/1", max_seen_urls=1)
## TODO: check this on Github actions:
# assert sorted(known_links) == ['https://httpbun.org/links/1/0', 'https://httpbun.org/links/1/1']
# assert sorted(todo) == ['https://httpbun.org/links/1/0']
todo, known_links = spider.focused_crawler("https://httpbun.com/links/1/1", max_seen_urls=1)
## fails on Github Actions
## assert sorted(known_links) == ['https://httpbun.com/links/1/0', 'https://httpbun.com/links/1/1']
## assert sorted(todo) == ['https://httpbun.com/links/1/0']


if __name__ == '__main__':
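Finally, a bounded focused-crawl sketch matching test_focused_crawler (same assumptions as above; output order may vary):

```python
# Focused crawl limited to a single fetched page, as in test_focused_crawler above.
from trafilatura import spider

todo, known_links = spider.focused_crawler('https://httpbun.com/links/1/1', max_seen_urls=1)
print(sorted(known_links))  # the /links/1/* URLs seen so far
print(sorted(todo))         # discovered but not yet visited URLs
```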
