tests: Mark Crawler tests correctly #4435

Merged · 1 commit · Mar 16, 2023
test/nodes/test_connector.py (44 changes: 18 additions & 26 deletions)
@@ -5,6 +5,7 @@
 import re
 import hashlib
 import os
+from unittest.mock import patch
 
 import pytest
 
@@ -54,9 +55,12 @@ def content_in_results(crawler: Crawler, url: str, results: List[Path], expected
     return sum(content_match(crawler, url, path) for path in results) == expected_matches_count
 
 
-#
-# Integration
-#
+@pytest.mark.unit
+@patch("haystack.nodes.connector.crawler.webdriver")
+def test_crawler_url_none_exception(webdriver):
+    crawler = Crawler()
+    with pytest.raises(ValueError):
+        crawler.crawl()
 
 
 @pytest.mark.integration
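
Note on the mocking pattern introduced above: `unittest.mock.patch` used as a decorator replaces the named attribute with a `MagicMock` for the duration of the test and passes that mock in as an extra argument, which is what lets `Crawler()` be constructed without launching a real browser. A minimal, self-contained sketch of the mechanism, using `os.getcwd` as a stand-in target (patching `haystack.nodes.connector.crawler.webdriver` itself would require Haystack to be importable):

    from unittest.mock import MagicMock, patch

    # patch() as a decorator swaps the named attribute for a MagicMock while
    # the decorated function runs, passing that mock as the first argument.
    @patch("os.getcwd")
    def show_patch_decorator(getcwd_mock: MagicMock) -> None:
        import os

        assert os.getcwd is getcwd_mock  # the module attribute is now the mock
        getcwd_mock.return_value = "/tmp"  # calls can be scripted freely
        assert os.getcwd() == "/tmp"

    show_patch_decorator()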
@@ -82,27 +86,15 @@ def test_crawler(tmp_path):
         assert file_content["content"] == document.content
 
 
-#
-# Unit tests
-#
-
-
-@pytest.mark.unit
-def test_crawler_url_none_exception(tmp_path):
-    crawler = Crawler()
-    with pytest.raises(ValueError):
-        crawler.crawl()
-
-
-@pytest.mark.unit
+@pytest.mark.integration
 def test_crawler_depth_0_single_url(test_url, tmp_path):
     crawler = Crawler(output_dir=tmp_path, crawler_depth=0, file_path_meta_field_name="file_path")
     documents = crawler.crawl(urls=[test_url + "/index.html"])
     assert len(documents) == 1
     assert content_match(crawler, test_url + "/index.html", documents[0].meta["file_path"])
 
 
-@pytest.mark.unit
+@pytest.mark.integration
 def test_crawler_depth_0_many_urls(test_url, tmp_path):
     crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
     _urls = [test_url + "/index.html", test_url + "/page1.html"]
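
A side note on the markers being swapped throughout this diff: pytest warns on unknown markers (and fails under `--strict-markers`), so `unit` and `integration` are normally declared in the project's pytest configuration. That configuration is not part of this diff; the sketch below shows only the generic mechanism, with made-up marker descriptions:

    # pytest.ini (illustrative; a real project may declare markers in
    # pyproject.toml or setup.cfg instead)
    [pytest]
    markers =
        unit: fast tests that run without a browser or network
        integration: tests that drive a real Selenium webdriver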
@@ -113,7 +105,7 @@ def test_crawler_depth_0_many_urls(test_url, tmp_path):
     assert content_in_results(crawler, test_url + "/page1.html", paths)
 
 
-@pytest.mark.unit
+@pytest.mark.integration
 def test_crawler_depth_1_single_url(test_url, tmp_path):
     crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
     documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=1)
@@ -124,7 +116,7 @@ def test_crawler_depth_1_single_url(test_url, tmp_path):
     assert content_in_results(crawler, test_url + "/page2.html", paths)
 
 
-@pytest.mark.unit
+@pytest.mark.integration
 def test_crawler_output_file_structure(test_url, tmp_path):
     crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
     documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
@@ -139,7 +131,7 @@ def test_crawler_output_file_structure(test_url, tmp_path):
         assert len(data["content"].split()) > 2
 
 
-@pytest.mark.unit
+@pytest.mark.integration
 def test_crawler_filter_urls(test_url, tmp_path):
     crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
 
@@ -154,7 +146,7 @@ def test_crawler_filter_urls(test_url, tmp_path):
     assert not crawler.crawl(urls=[test_url + "/index.html"], filter_urls=["google.com"], crawler_depth=1)
 
 
-@pytest.mark.unit
+@pytest.mark.integration
 def test_crawler_extract_hidden_text(test_url, tmp_path):
     crawler = Crawler(output_dir=tmp_path)
     documents, _ = crawler.run(urls=[test_url + "/page_w_hidden_text.html"], extract_hidden_text=True, crawler_depth=0)
@@ -166,7 +158,7 @@ def test_crawler_extract_hidden_text(test_url, tmp_path):
     assert "hidden text" not in crawled_content
 
 
-@pytest.mark.unit
+@pytest.mark.integration
 def test_crawler_loading_wait_time(test_url, tmp_path):
     loading_wait_time = 3
     crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
@@ -197,7 +189,7 @@ def test_crawler_loading_wait_time(test_url, tmp_path):
     assert content_in_results(crawler, test_url + "/page2.html", paths)
 
 
-@pytest.mark.unit
+@pytest.mark.integration
 def test_crawler_default_naming_function(test_url, tmp_path):
     crawler = Crawler(output_dir=tmp_path, file_path_meta_field_name="file_path")
 
@@ -213,7 +205,7 @@ def test_crawler_default_naming_function(test_url, tmp_path):
     assert path == Path(expected_crawled_file_path)
 
 
-@pytest.mark.unit
+@pytest.mark.integration
 def test_crawler_naming_function(test_url, tmp_path):
     crawler = Crawler(
         output_dir=tmp_path,
@@ -231,14 +223,14 @@ def test_crawler_naming_function(test_url, tmp_path):
     assert path == expected_crawled_file_path
 
 
-@pytest.mark.unit
+@pytest.mark.integration
 def test_crawler_not_save_file(test_url):
     crawler = Crawler()
     documents = crawler.crawl(urls=[test_url + "/index.html"], crawler_depth=0)
     assert documents[0].meta.get("file_path", None) is None
 
 
-@pytest.mark.unit
+@pytest.mark.integration
 def test_crawler_custom_meta_file_path_name(test_url, tmp_path):
     crawler = Crawler()
     documents = crawler.crawl(
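
With the markers corrected, the two groups can be selected independently, e.g. `pytest -m unit test/nodes/test_connector.py` for the fast mocked tests and `pytest -m integration` for the Selenium-backed ones. A sketch of the programmatic equivalent (the file path is the one from this diff):

    import pytest

    # Same as `pytest -m unit test/nodes/test_connector.py` on the command
    # line: -m filters the collected tests by marker expression.
    exit_code = pytest.main(["-m", "unit", "test/nodes/test_connector.py"])
    print("pytest exit code:", exit_code)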