From 35c0511fa9917e653df50cb95a22105b397e14c0 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Wed, 28 Dec 2022 08:48:52 +0100 Subject: [PATCH] Always display test coverage; add tests (#240) * Set up a coverage config file `pytest` now displays coverage stats and `pytest --cov-report html` produces a useful coverage report * Always display the 10 slowest tests * Expanded on data.py tests * Wrote tests for exporting ONNX with Torch head However, these must be skipped currently, as they all fail! * Reformat test_onnx.py For some reason, locally I get different warnings for ./make quality than the CI does. * Rerun ./make style --- .coveragerc | 36 ++++++++++ setup.cfg | 6 +- tests/exporters/test_onnx.py | 125 +++++++++++++++++++++++++++-------- tests/test_data.py | 69 ++++++++++++++++++- 4 files changed, 208 insertions(+), 28 deletions(-) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..9026d1bc --- /dev/null +++ b/.coveragerc @@ -0,0 +1,36 @@ +# Configuration file to control (pytest) coverage +[run] +# Run branch coverage, too +branch = True + +[paths] +source = + src/setfit + +[report] +# Regexes for lines to exclude from consideration +exclude_lines = + # Have to re-enable the standard pragma + pragma: no cover + + # Don't complain about missing debug-only code: + def __repr__ + if self\.debug + + # Don't complain if tests don't hit defensive assertion code: + raise AssertionError + raise NotImplementedError + + # Don't complain if non-runnable code isn't run: + if 0: + if __name__ == .__main__.: + + # Don't complain about abstract methods, they aren't run: + @(abc\.)?abstractmethod + + # Ignore TYPE_CHECKING code + if TYPE_CHECKING: + +[html] +directory = coverage_report_html +title = SetFit coverage report \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 21f0dd77..36bd263e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,4 +16,8 @@ per-file-ignores = exclude = results scripts/adapet - scripts/tfew \ No newline at end of file + scripts/tfew + +[tool:pytest] +testpaths = tests +addopts = --cov=setfit --durations=10 \ No newline at end of file diff --git a/tests/exporters/test_onnx.py b/tests/exporters/test_onnx.py index 4f353439..c1778cf9 100644 --- a/tests/exporters/test_onnx.py +++ b/tests/exporters/test_onnx.py @@ -2,10 +2,14 @@ import numpy as np import onnxruntime +import pytest +from datasets import Dataset from transformers import AutoTokenizer from setfit import SetFitModel +from setfit.data import get_augmented_samples from setfit.exporters.onnx import export_onnx +from setfit.trainer import SetFitTrainer def test_export_onnx_sklearn_head(): @@ -15,34 +19,103 @@ def test_export_onnx_sklearn_head(): # Export the sklearn based model output_path = "model.onnx" - export_onnx(model.model_body, model.model_head, opset=12, output_path=output_path) - - # Check that the model was saved. - assert output_path in os.listdir(), "Model not saved to output_path" - - # Run inference using the original model. - input_text = ["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"] - pytorch_preds = model(input_text) - - # Run inference using the exported onnx model. 
- tokenizer = AutoTokenizer.from_pretrained(model_path) - inputs = tokenizer( - input_text, - padding=True, - truncation=True, - return_attention_mask=True, - return_token_type_ids=True, - return_tensors="np", + try: + export_onnx(model.model_body, model.model_head, opset=12, output_path=output_path) + + # Check that the model was saved. + assert output_path in os.listdir(), "Model not saved to output_path" + + # Run inference using the original model. + input_text = ["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"] + pytorch_preds = model(input_text) + + # Run inference using the exported onnx model. + tokenizer = AutoTokenizer.from_pretrained(model_path) + inputs = tokenizer( + input_text, + padding=True, + truncation=True, + return_attention_mask=True, + return_token_type_ids=True, + return_tensors="np", + ) + # Map inputs to int64 from int32 + inputs = {key: value.astype("int64") for key, value in inputs.items()} + + session = onnxruntime.InferenceSession(output_path) + + onnx_preds = session.run(None, dict(inputs))[0] + + # Compare the results and ensure that we get the same predictions. + assert np.array_equal(onnx_preds, pytorch_preds) + + finally: + # Cleanup the model. + os.remove(output_path) + + +@pytest.mark.skip("ONNX exporting of SetFit model with Torch head not yet supported.") +@pytest.mark.parametrize("out_features", [1, 2, 3]) +def test_export_onnx_torch_head(out_features): + """Test that the exported `ONNX` model returns the same predictions as the original model.""" + dataset = Dataset.from_dict(get_augmented_samples("SentEval-CR")) + model_path = "sentence-transformers/paraphrase-albert-small-v2" + model = SetFitModel.from_pretrained( + model_path, use_differentiable_head=True, head_params={"out_features": out_features} ) - # Map inputs to int64 from int32 - inputs = {key: value.astype("int64") for key, value in inputs.items()} - session = onnxruntime.InferenceSession(output_path) + trainer = SetFitTrainer( + model=model, + train_dataset=dataset, + eval_dataset=dataset, + num_iterations=15, + column_mapping={"text": "text", "label": "label"}, + ) + # Train and evaluate + trainer.freeze() # Freeze the head + trainer.train() # Train only the body + # Unfreeze the head and unfreeze the body -> end-to-end training + trainer.unfreeze(keep_body_frozen=False) + trainer.train( + num_epochs=15, + batch_size=16, + body_learning_rate=1e-5, + learning_rate=1e-2, + l2_weight=0.0, + ) + + # Export the sklearn based model + output_path = "model.onnx" + try: + export_onnx(model.model_body, model.model_head, opset=12, output_path=output_path) + + # Check that the model was saved. + assert output_path in os.listdir(), "Model not saved to output_path" + + # Run inference using the original model. + input_text = ["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"] + pytorch_preds = model(input_text) + + # Run inference using the exported onnx model. + tokenizer = AutoTokenizer.from_pretrained(model_path) + inputs = tokenizer( + input_text, + padding=True, + truncation=True, + return_attention_mask=True, + return_token_type_ids=True, + return_tensors="np", + ) + # Map inputs to int64 from int32 + inputs = {key: value.astype("int64") for key, value in inputs.items()} + + session = onnxruntime.InferenceSession(output_path) - onnx_preds = session.run(None, dict(inputs))[0] + onnx_preds = session.run(None, dict(inputs))[0] - # Compare the results and ensure that we get the same predictions. 
- assert np.array_equal(onnx_preds, pytorch_preds) + # Compare the results and ensure that we get the same predictions. + assert np.array_equal(onnx_preds, pytorch_preds) - # Cleanup the model. - os.remove(output_path) + finally: + # Cleanup the model. + os.remove(output_path) diff --git a/tests/test_data.py b/tests/test_data.py index d06bd6bc..c855356d 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,15 +1,18 @@ import string +import numpy as np import pandas as pd import pytest -from datasets import Dataset +from datasets import Dataset, load_dataset from setfit.data import ( SAMPLE_SIZES, SEEDS, add_templated_examples, create_fewshot_splits, + create_fewshot_splits_multilabel, create_samples, + get_augmented_samples, sample_dataset, ) @@ -102,9 +105,48 @@ def test_subset_is_smaller_than_sample_size(sample_size): def test_expected_number_of_splits(): dataset = Dataset.from_pandas(pd.DataFrame({"label": [0] * 50 + [1] * 50})) + num_labels = 2 splits_ds = create_fewshot_splits(dataset, SAMPLE_SIZES) assert len(splits_ds) == len(SAMPLE_SIZES) * len(SEEDS) + split: Dataset + for idx, split in enumerate(splits_ds.values()): + sample_size = SAMPLE_SIZES[idx // len(SEEDS)] + # The number of rows is limited by 100 due to the size of the original dataset + assert len(split) == min(sample_size * num_labels, len(dataset)) + + +def test_create_fewshot_splits_with_augmentation(): + dataset_name = "sst5" + dataset = load_dataset(f"SetFit/{dataset_name}", split="train") + num_labels = len(set(dataset["label"])) + splits_ds = create_fewshot_splits(dataset, SAMPLE_SIZES, add_data_augmentation=True, dataset_name=dataset_name) + assert len(splits_ds) == len(SAMPLE_SIZES) * len(SEEDS) + + split: Dataset + for idx, split in enumerate(splits_ds.values()): + sample_size = SAMPLE_SIZES[idx // len(SEEDS)] + # Each split should have sample_size * num_labels * 2 rows: + # for each label we sample `sample_size`, and then we generate + # another `sample_size` samples through augmentation. + assert len(split) == sample_size * num_labels * 2 + + +def test_create_fewshot_splits_multilabel(): + num_samples = 50 + dataset = Dataset.from_dict( + { + "text": string.ascii_letters[:50], + "label_one": np.random.randint(2, size=(num_samples,)), + "label_two": np.random.randint(2, size=(num_samples,)), + "label_three": np.random.randint(2, size=(num_samples,)), + } + ) + splits_ds = create_fewshot_splits_multilabel(dataset, SAMPLE_SIZES) + assert len(splits_ds) == len(SAMPLE_SIZES) * len(SEEDS) + # We can't safely test the number of rows of each of the splits + # as duplicate samples are removed. + def test_sample_dataset_returns_expected_samples(): num_samples = 2 @@ -130,3 +172,28 @@ def test_sample_dataset_with_unbalanced_ds(unbalanced_dataset): # has one label with more than `num_samples` entries and another label with just 1 row. # We sample `num_samples` from the former, and 1 from the latter. assert ds.num_rows == num_samples + 1 + + +@pytest.mark.parametrize( + "dataset", + [ + "emotion", + "ag_news", + "amazon_counterfactual_en", + "SentEval-CR", + "sst5", + "enron_spam", + "tweet_eval_stance_abortion", + "ade_corpus_v2_classification", + ], +) +def test_get_augmented_samples(dataset: str): + dataset_dict = get_augmented_samples(dataset) + assert set(dataset_dict.keys()) == {"text", "label"} + assert len(dataset_dict["text"]) + assert len(dataset_dict["label"]) + + +def test_get_augmented_samples_negative(): + with pytest.raises(ValueError): + get_augmented_samples(None)
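
For reference, the export-and-compare flow that these tests exercise can also be run as a short standalone script. The sketch below mirrors the logic of test_export_onnx_sklearn_head above; the checkpoint name is a hypothetical placeholder and must point at an already trained SetFit model (one whose sklearn head has been fitted), otherwise the export step is expected to fail.

# Standalone sketch of the ONNX export round-trip covered by the tests above.
# Assumes setfit, transformers, onnxruntime and numpy are installed.
import os

import numpy as np
import onnxruntime
from transformers import AutoTokenizer

from setfit import SetFitModel
from setfit.exporters.onnx import export_onnx

# Hypothetical placeholder: any trained SetFit checkpoint with a fitted sklearn head.
model_path = "path/to/a-trained-setfit-model"
model = SetFitModel.from_pretrained(model_path)

output_path = "model.onnx"
try:
    # Export the sentence-transformer body and the sklearn head into one ONNX graph.
    export_onnx(model.model_body, model.model_head, opset=12, output_path=output_path)

    input_text = ["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"]
    pytorch_preds = model(input_text)

    # Tokenize with numpy tensors and cast the ids to int64, as the tests do,
    # since the exported graph expects 64-bit integer inputs.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    inputs = tokenizer(
        input_text,
        padding=True,
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="np",
    )
    inputs = {key: value.astype("int64") for key, value in inputs.items()}

    session = onnxruntime.InferenceSession(output_path)
    onnx_preds = session.run(None, dict(inputs))[0]

    print("ONNX and PyTorch predictions match:", np.array_equal(onnx_preds, pytorch_preds))
finally:
    # Clean up the exported file, mirroring the try/finally pattern used in the tests.
    if os.path.exists(output_path):
        os.remove(output_path)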
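
Similarly, the few-shot split helpers covered in tests/test_data.py can be tried interactively. This is a minimal sketch using the same toy two-label dataset as test_expected_number_of_splits; the printed split names and sizes are illustrative only.

# Sketch of the few-shot split helpers exercised in tests/test_data.py.
import pandas as pd
from datasets import Dataset

from setfit.data import SAMPLE_SIZES, SEEDS, create_fewshot_splits

# Toy binary dataset: 50 examples per label, as in test_expected_number_of_splits.
dataset = Dataset.from_pandas(pd.DataFrame({"label": [0] * 50 + [1] * 50}))

splits = create_fewshot_splits(dataset, SAMPLE_SIZES)

# One split per (sample size, seed) pair; each split holds up to
# sample_size * num_labels rows, capped by the size of the source dataset.
assert len(splits) == len(SAMPLE_SIZES) * len(SEEDS)
for name, split in splits.items():
    print(name, len(split))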