From 35c0511fa9917e653df50cb95a22105b397e14c0 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Wed, 28 Dec 2022 08:48:52 +0100 Subject: [PATCH] Always display test coverage; add tests (#240) * Set up a coverage config file `pytest` now displays coverage stats and `pytest --cov-report html` produces a useful coverage report * Always display the 10 slowest tests * Expanded on data.py tests * Wrote tests for exporting ONNX with Torch head However, these must be skipped currently, as they all fail! * Reformat test_onnx.py For some reason, locally I get different warnings for ./make quality than the CI does. * Rerun ./make style --- .coveragerc | 36 ++++++++++ setup.cfg | 6 +- tests/exporters/test_onnx.py | 125 +++++++++++++++++++++++++++-------- tests/test_data.py | 69 ++++++++++++++++++- 4 files changed, 208 insertions(+), 28 deletions(-) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..9026d1bc --- /dev/null +++ b/.coveragerc @@ -0,0 +1,36 @@ +# Configuration file to control (pytest) coverage +[run] +# Run branch coverage, too +branch = True + +[paths] +source = + src/setfit + +[report] +# Regexes for lines to exclude from consideration +exclude_lines = + # Have to re-enable the standard pragma + pragma: no cover + + # Don't complain about missing debug-only code: + def __repr__ + if self\.debug + + # Don't complain if tests don't hit defensive assertion code: + raise AssertionError + raise NotImplementedError + + # Don't complain if non-runnable code isn't run: + if 0: + if __name__ == .__main__.: + + # Don't complain about abstract methods, they aren't run: + @(abc\.)?abstractmethod + + # Ignore TYPE_CHECKING code + if TYPE_CHECKING: + +[html] +directory = coverage_report_html +title = SetFit coverage report \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 21f0dd77..36bd263e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,4 +16,8 @@ per-file-ignores = exclude = results scripts/adapet - scripts/tfew \ No newline at end of file + scripts/tfew + +[tool:pytest] +testpaths = tests +addopts = --cov=setfit --durations=10 \ No newline at end of file diff --git a/tests/exporters/test_onnx.py b/tests/exporters/test_onnx.py index 4f353439..c1778cf9 100644 --- a/tests/exporters/test_onnx.py +++ b/tests/exporters/test_onnx.py @@ -2,10 +2,14 @@ import numpy as np import onnxruntime +import pytest +from datasets import Dataset from transformers import AutoTokenizer from setfit import SetFitModel +from setfit.data import get_augmented_samples from setfit.exporters.onnx import export_onnx +from setfit.trainer import SetFitTrainer def test_export_onnx_sklearn_head(): @@ -15,34 +19,103 @@ def test_export_onnx_sklearn_head(): # Export the sklearn based model output_path = "model.onnx" - export_onnx(model.model_body, model.model_head, opset=12, output_path=output_path) - - # Check that the model was saved. - assert output_path in os.listdir(), "Model not saved to output_path" - - # Run inference using the original model. - input_text = ["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"] - pytorch_preds = model(input_text) - - # Run inference using the exported onnx model. 
- tokenizer = AutoTokenizer.from_pretrained(model_path) - inputs = tokenizer( - input_text, - padding=True, - truncation=True, - return_attention_mask=True, - return_token_type_ids=True, - return_tensors="np", + try: + export_onnx(model.model_body, model.model_head, opset=12, output_path=output_path) + + # Check that the model was saved. + assert output_path in os.listdir(), "Model not saved to output_path" + + # Run inference using the original model. + input_text = ["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"] + pytorch_preds = model(input_text) + + # Run inference using the exported onnx model. + tokenizer = AutoTokenizer.from_pretrained(model_path) + inputs = tokenizer( + input_text, + padding=True, + truncation=True, + return_attention_mask=True, + return_token_type_ids=True, + return_tensors="np", + ) + # Map inputs to int64 from int32 + inputs = {key: value.astype("int64") for key, value in inputs.items()} + + session = onnxruntime.InferenceSession(output_path) + + onnx_preds = session.run(None, dict(inputs))[0] + + # Compare the results and ensure that we get the same predictions. + assert np.array_equal(onnx_preds, pytorch_preds) + + finally: + # Cleanup the model. + os.remove(output_path) + + +@pytest.mark.skip("ONNX exporting of SetFit model with Torch head not yet supported.") +@pytest.mark.parametrize("out_features", [1, 2, 3]) +def test_export_onnx_torch_head(out_features): + """Test that the exported `ONNX` model returns the same predictions as the original model.""" + dataset = Dataset.from_dict(get_augmented_samples("SentEval-CR")) + model_path = "sentence-transformers/paraphrase-albert-small-v2" + model = SetFitModel.from_pretrained( + model_path, use_differentiable_head=True, head_params={"out_features": out_features} ) - # Map inputs to int64 from int32 - inputs = {key: value.astype("int64") for key, value in inputs.items()} - session = onnxruntime.InferenceSession(output_path) + trainer = SetFitTrainer( + model=model, + train_dataset=dataset, + eval_dataset=dataset, + num_iterations=15, + column_mapping={"text": "text", "label": "label"}, + ) + # Train and evaluate + trainer.freeze() # Freeze the head + trainer.train() # Train only the body + # Unfreeze the head and unfreeze the body -> end-to-end training + trainer.unfreeze(keep_body_frozen=False) + trainer.train( + num_epochs=15, + batch_size=16, + body_learning_rate=1e-5, + learning_rate=1e-2, + l2_weight=0.0, + ) + + # Export the sklearn based model + output_path = "model.onnx" + try: + export_onnx(model.model_body, model.model_head, opset=12, output_path=output_path) + + # Check that the model was saved. + assert output_path in os.listdir(), "Model not saved to output_path" + + # Run inference using the original model. + input_text = ["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"] + pytorch_preds = model(input_text) + + # Run inference using the exported onnx model. + tokenizer = AutoTokenizer.from_pretrained(model_path) + inputs = tokenizer( + input_text, + padding=True, + truncation=True, + return_attention_mask=True, + return_token_type_ids=True, + return_tensors="np", + ) + # Map inputs to int64 from int32 + inputs = {key: value.astype("int64") for key, value in inputs.items()} + + session = onnxruntime.InferenceSession(output_path) - onnx_preds = session.run(None, dict(inputs))[0] + onnx_preds = session.run(None, dict(inputs))[0] - # Compare the results and ensure that we get the same predictions. 
- assert np.array_equal(onnx_preds, pytorch_preds) + # Compare the results and ensure that we get the same predictions. + assert np.array_equal(onnx_preds, pytorch_preds) - # Cleanup the model. - os.remove(output_path) + finally: + # Cleanup the model. + os.remove(output_path) diff --git a/tests/test_data.py b/tests/test_data.py index d06bd6bc..c855356d 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,15 +1,18 @@ import string +import numpy as np import pandas as pd import pytest -from datasets import Dataset +from datasets import Dataset, load_dataset from setfit.data import ( SAMPLE_SIZES, SEEDS, add_templated_examples, create_fewshot_splits, + create_fewshot_splits_multilabel, create_samples, + get_augmented_samples, sample_dataset, ) @@ -102,9 +105,48 @@ def test_subset_is_smaller_than_sample_size(sample_size): def test_expected_number_of_splits(): dataset = Dataset.from_pandas(pd.DataFrame({"label": [0] * 50 + [1] * 50})) + num_labels = 2 splits_ds = create_fewshot_splits(dataset, SAMPLE_SIZES) assert len(splits_ds) == len(SAMPLE_SIZES) * len(SEEDS) + split: Dataset + for idx, split in enumerate(splits_ds.values()): + sample_size = SAMPLE_SIZES[idx // len(SEEDS)] + # The number of rows is limited by 100 due to the size of the original dataset + assert len(split) == min(sample_size * num_labels, len(dataset)) + + +def test_create_fewshot_splits_with_augmentation(): + dataset_name = "sst5" + dataset = load_dataset(f"SetFit/{dataset_name}", split="train") + num_labels = len(set(dataset["label"])) + splits_ds = create_fewshot_splits(dataset, SAMPLE_SIZES, add_data_augmentation=True, dataset_name=dataset_name) + assert len(splits_ds) == len(SAMPLE_SIZES) * len(SEEDS) + + split: Dataset + for idx, split in enumerate(splits_ds.values()): + sample_size = SAMPLE_SIZES[idx // len(SEEDS)] + # Each split should have sample_size * num_labels * 2 rows: + # for each label we sample `sample_size`, and then we generate + # another `sample_size` samples through augmentation. + assert len(split) == sample_size * num_labels * 2 + + +def test_create_fewshot_splits_multilabel(): + num_samples = 50 + dataset = Dataset.from_dict( + { + "text": string.ascii_letters[:50], + "label_one": np.random.randint(2, size=(num_samples,)), + "label_two": np.random.randint(2, size=(num_samples,)), + "label_three": np.random.randint(2, size=(num_samples,)), + } + ) + splits_ds = create_fewshot_splits_multilabel(dataset, SAMPLE_SIZES) + assert len(splits_ds) == len(SAMPLE_SIZES) * len(SEEDS) + # We can't safely test the number of rows of each of the splits + # as duplicate samples are removed. + def test_sample_dataset_returns_expected_samples(): num_samples = 2 @@ -130,3 +172,28 @@ def test_sample_dataset_with_unbalanced_ds(unbalanced_dataset): # has one label with more than `num_samples` entries and another label with just 1 row. # We sample `num_samples` from the former, and 1 from the latter. assert ds.num_rows == num_samples + 1 + + +@pytest.mark.parametrize( + "dataset", + [ + "emotion", + "ag_news", + "amazon_counterfactual_en", + "SentEval-CR", + "sst5", + "enron_spam", + "tweet_eval_stance_abortion", + "ade_corpus_v2_classification", + ], +) +def test_get_augmented_samples(dataset: str): + dataset_dict = get_augmented_samples(dataset) + assert set(dataset_dict.keys()) == {"text", "label"} + assert len(dataset_dict["text"]) + assert len(dataset_dict["label"]) + + +def test_get_augmented_samples_negative(): + with pytest.raises(ValueError): + get_augmented_samples(None)
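
For reference, the export-and-compare flow that these tests exercise can also be run as a short standalone script. The sketch below mirrors the logic of test_export_onnx_sklearn_head above; the checkpoint name is a hypothetical placeholder and must point at an already trained SetFit model (one whose sklearn head has been fitted), otherwise the export step is expected to fail.

# Standalone sketch of the ONNX export round-trip covered by the tests above.
# Assumes setfit, transformers, onnxruntime and numpy are installed.
import os

import numpy as np
import onnxruntime
from transformers import AutoTokenizer

from setfit import SetFitModel
from setfit.exporters.onnx import export_onnx

# Hypothetical placeholder: any trained SetFit checkpoint with a fitted sklearn head.
model_path = "path/to/a-trained-setfit-model"
model = SetFitModel.from_pretrained(model_path)

output_path = "model.onnx"
try:
    # Export the sentence-transformer body and the sklearn head into one ONNX graph.
    export_onnx(model.model_body, model.model_head, opset=12, output_path=output_path)

    input_text = ["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"]
    pytorch_preds = model(input_text)

    # Tokenize with numpy tensors and cast the ids to int64, as the tests do,
    # since the exported graph expects 64-bit integer inputs.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    inputs = tokenizer(
        input_text,
        padding=True,
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="np",
    )
    inputs = {key: value.astype("int64") for key, value in inputs.items()}

    session = onnxruntime.InferenceSession(output_path)
    onnx_preds = session.run(None, dict(inputs))[0]

    print("ONNX and PyTorch predictions match:", np.array_equal(onnx_preds, pytorch_preds))
finally:
    # Clean up the exported file, mirroring the try/finally pattern used in the tests.
    if os.path.exists(output_path):
        os.remove(output_path)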
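
Similarly, the few-shot split helpers covered in tests/test_data.py can be tried interactively. This is a minimal sketch using the same toy two-label dataset as test_expected_number_of_splits; the printed split names and sizes are illustrative only.

# Sketch of the few-shot split helpers exercised in tests/test_data.py.
import pandas as pd
from datasets import Dataset

from setfit.data import SAMPLE_SIZES, SEEDS, create_fewshot_splits

# Toy binary dataset: 50 examples per label, as in test_expected_number_of_splits.
dataset = Dataset.from_pandas(pd.DataFrame({"label": [0] * 50 + [1] * 50}))

splits = create_fewshot_splits(dataset, SAMPLE_SIZES)

# One split per (sample size, seed) pair; each split holds up to
# sample_size * num_labels rows, capped by the size of the source dataset.
assert len(splits) == len(SAMPLE_SIZES) * len(SEEDS)
for name, split in splits.items():
    print(name, len(split))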