Merge branch 'master' into half_testing2
Borda authored Mar 29, 2021
2 parents 7e186c2 + 53d5701 commit 9e21f63
Showing 35 changed files with 676 additions and 87 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -21,6 +21,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added `RetrievalMRR` metric for Information Retrieval ([#119](https://github.com/PyTorchLightning/metrics/pull/119))


- Added `RetrievalPrecision` metric for Information Retrieval ([#119](https://github.com/PyTorchLightning/metrics/pull/119))


- Added `average='micro'` as an option in AUROC for multilabel problems ([#110](https://github.com/PyTorchLightning/metrics/pull/110))


@@ -38,6 +41,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
)


- Added `BootStrapper` to easily calculate confidence intervals for metrics ([#101](https://github.com/PyTorchLightning/metrics/pull/101))


### Changed

- Changed `ExplainedVariance` from storing all preds/targets to tracking 5 statistics ([#68](https://github.com/PyTorchLightning/metrics/pull/68))
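As a usage note for the new retrieval metric above: the following is a minimal sketch of the `RetrievalPrecision` module, pieced together from the tests added in this commit; the default handling of queries without relevant documents is assumed.

import torch
from torchmetrics import RetrievalPrecision

# `indexes` assigns every document to a query: documents 0-2 belong to query 0, 3-5 to query 1.
indexes = torch.tensor([0, 0, 0, 1, 1, 1])
preds = torch.tensor([0.9, 0.3, 0.5, 0.2, 0.8, 0.4])            # predicted relevance scores
target = torch.tensor([True, False, True, False, True, False])  # ground-truth relevance

metric = RetrievalPrecision(k=2)    # precision over the top-2 documents of each query
metric(indexes, preds, target)      # query 0 -> 1.0, query 1 -> 0.5, averaged -> tensor(0.7500)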
9 changes: 8 additions & 1 deletion docs/source/references/functional.rst
@@ -248,7 +248,14 @@ retrieval_average_precision [func]


retrieval_reciprocal_rank [func]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: torchmetrics.functional.retrieval_reciprocal_rank
:noindex:


retrieval_precision [func]
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: torchmetrics.functional.retrieval_precision
:noindex:
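A short sketch of the functional form documented above, with the call signature taken from the tests in this commit:

import torch
from torchmetrics.functional import retrieval_precision

preds = torch.tensor([0.2, 0.9, 0.5, 0.4])          # scores for the documents of a single query
target = torch.tensor([False, True, True, False])   # relevance labels for the same documents

retrieval_precision(preds, target, k=2)  # top-2 by score are 0.9 and 0.5, both relevant -> tensor(1.)
retrieval_precision(preds, target)       # k=None considers all documents: 2 relevant out of 4 -> tensor(0.5000)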
18 changes: 18 additions & 0 deletions docs/source/references/modules.rst
@@ -334,3 +334,21 @@ RetrievalMRR

.. autoclass:: torchmetrics.RetrievalMRR
:noindex:


RetrievalPrecision
~~~~~~~~~~~~~~~~~~

.. autoclass:: torchmetrics.RetrievalPrecision
:noindex:


********
Wrappers
********

Modular wrapper metrics are not metrics in themselves, but instead take a metric and alter the internal logic
of the base metric.

.. autoclass:: torchmetrics.BootStrapper
:noindex:
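A hedged sketch of how the `BootStrapper` wrapper documented above might be used; the `num_bootstraps` argument and the dictionary returned by `compute()` are assumptions not shown in this diff.

import torch
from torchmetrics import Accuracy, BootStrapper

# Wrap a base metric; the wrapper resamples incoming batches to estimate the
# spread of the metric value (constructor argument `num_bootstraps` is assumed).
bootstrap = BootStrapper(Accuracy(), num_bootstraps=20)

preds = torch.randint(0, 2, (100,))
target = torch.randint(0, 2, (100,))
bootstrap.update(preds, target)

bootstrap.compute()  # assumed to return the bootstrapped mean/std of the accuracy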
49 changes: 49 additions & 0 deletions tests/functional/test_retrieval.py
@@ -7,7 +7,9 @@

from tests.helpers import seed_all
from tests.retrieval.test_mrr import _reciprocal_rank as reciprocal_rank
from tests.retrieval.test_precision import _precision_at_k as precision_at_k
from torchmetrics.functional.retrieval.average_precision import retrieval_average_precision
from torchmetrics.functional.retrieval.precision import retrieval_precision
from torchmetrics.functional.retrieval.reciprocal_rank import retrieval_reciprocal_rank

seed_all(1337)
@@ -42,9 +44,39 @@ def test_metrics_output_values(sklearn_metric, torch_metric, size):
assert torch.allclose(sk.float(), tm.float())


@pytest.mark.parametrize(['sklearn_metric', 'torch_metric'], [
[precision_at_k, retrieval_precision],
])
@pytest.mark.parametrize("size", [1, 4, 10])
@pytest.mark.parametrize("k", [None, 1, 4, 10])
def test_metrics_output_values_with_k(sklearn_metric, torch_metric, size, k):
""" Compare PL metrics to sklearn version. """
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# test results are computed correctly wrt std implementation
for i in range(6):
preds = np.random.randn(size)
target = np.random.randn(size) > 0

# sometimes test with integer targets
if (i % 2) == 0:
target = target.astype(np.int)

sk = torch.tensor(sklearn_metric(target, preds, k), device=device)
tm = torch_metric(torch.tensor(preds, device=device), torch.tensor(target, device=device), k)

# `torch_metric`s return 0 when no label is True
# while `sklearn` metrics returns NaN
if math.isnan(sk):
assert tm == 0
else:
assert torch.allclose(sk.float(), tm.float())


@pytest.mark.parametrize(['torch_metric'], [
[retrieval_average_precision],
[retrieval_reciprocal_rank],
[retrieval_precision]
])
def test_input_dtypes(torch_metric) -> None:
""" Check wrong input dtypes are managed correctly. """
@@ -75,6 +107,7 @@ def test_input_dtypes(torch_metric) -> None:
@pytest.mark.parametrize(['torch_metric'], [
[retrieval_average_precision],
[retrieval_reciprocal_rank],
[retrieval_precision]
])
def test_input_shapes(torch_metric) -> None:
""" Check wrong input shapes are managed correctly. """
@@ -93,3 +126,19 @@ def test_input_shapes(torch_metric) -> None:

with pytest.raises(ValueError, match="`preds` and `target` must be of the same shape"):
torch_metric(preds, target)


# test metrics using top K parameter
@pytest.mark.parametrize(['torch_metric'], [
[retrieval_precision]
])
@pytest.mark.parametrize('k', [-1, 1.0])
def test_input_params(torch_metric, k) -> None:
""" Check wrong input shapes are managed correctly. """
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# test with random tensors
preds = torch.tensor([0] * 4, device=device, dtype=torch.float)
target = torch.tensor([0] * 4, device=device, dtype=torch.int64)
with pytest.raises(ValueError, match="`k` has to be a positive integer or None"):
torch_metric(preds, target, k=k)
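The `isnan` branch in the test above exists because the torchmetrics functionals fall back to 0 for a query without any relevant document, while the sklearn-style references return NaN; a minimal illustration:

import torch
from torchmetrics.functional import retrieval_precision

preds = torch.tensor([0.4, 0.7, 0.1])
target = torch.tensor([False, False, False])  # no relevant document in this query

retrieval_precision(preds, target)  # -> tensor(0.) rather than NaN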
22 changes: 17 additions & 5 deletions tests/retrieval/helpers.py
@@ -6,12 +6,13 @@
from torch import Tensor

from tests.helpers import seed_all
from torchmetrics import Metric

seed_all(1337)


def _compute_sklearn_metric(
metric: Callable, target: List[np.ndarray], preds: List[np.ndarray], behaviour: str
metric: Callable, target: List[np.ndarray], preds: List[np.ndarray], behaviour: str, **kwargs
) -> Tensor:
""" Compute metric with multiple iterations over every query predictions set. """
sk_results = []
@@ -25,7 +26,7 @@ def _compute_sklearn_metric(
else:
sk_results.append(0.0)
else:
res = metric(b, a)
res = metric(b, a, **kwargs)
sk_results.append(res)

if len(sk_results) > 0:
@@ -34,10 +35,15 @@


def _test_retrieval_against_sklearn(
sklearn_metric, torch_metric, size, n_documents, query_without_relevant_docs_options
sklearn_metric: Callable,
torch_metric: Metric,
size: int,
n_documents: int,
query_without_relevant_docs_options: str,
**kwargs
) -> None:
""" Compare PL metrics to standard version. """
metric = torch_metric(query_without_relevant_docs=query_without_relevant_docs_options)
metric = torch_metric(query_without_relevant_docs=query_without_relevant_docs_options, **kwargs)
shape = (size, )

indexes = []
@@ -49,7 +55,7 @@ def _test_retrieval_against_sklearn(
preds.append(np.random.randn(*shape))
target.append(np.random.randn(*shape) > 0)

sk_results = _compute_sklearn_metric(sklearn_metric, target, preds, query_without_relevant_docs_options)
sk_results = _compute_sklearn_metric(sklearn_metric, target, preds, query_without_relevant_docs_options, **kwargs)
sk_results = torch.tensor(sk_results)

indexes_tensor = torch.cat([torch.tensor(i) for i in indexes]).long()
@@ -120,3 +126,9 @@ def _test_input_shapes(torchmetric) -> None:

with pytest.raises(ValueError, match="`indexes`, `preds` and `target` must be of the same shape"):
metric(indexes, preds, target)


def _test_input_args(torchmetric: Metric, message: str, **kwargs) -> None:
"""Check invalid args are managed correctly. """
with pytest.raises(ValueError, match=message):
torchmetric(**kwargs)
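For orientation, the comparison helper above feeds the sklearn-style reference one array per query, while the torchmetrics `Metric` receives flat tensors plus an `indexes` tensor carrying the query id of every document; a small sketch of the two equivalent representations (tensor names are illustrative only):

import numpy as np
import torch

# Per-query lists, as consumed by the sklearn-style reference ...
preds = [np.array([0.9, 0.3]), np.array([0.2, 0.8, 0.4])]
target = [np.array([True, False]), np.array([False, True, False])]

# ... and the flat representation consumed by the torchmetrics Metric,
# mirroring the concatenation done in `_test_retrieval_against_sklearn`.
indexes_tensor = torch.cat([torch.full((len(p),), i, dtype=torch.long) for i, p in enumerate(preds)])
preds_tensor = torch.cat([torch.tensor(p) for p in preds]).float()
target_tensor = torch.cat([torch.tensor(t) for t in target]).long()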
20 changes: 13 additions & 7 deletions tests/retrieval/test_mrr.py
@@ -1,25 +1,31 @@
import numpy as np
import pytest
from sklearn.metrics import label_ranking_average_precision_score

from tests.retrieval.helpers import _test_dtypes, _test_input_shapes, _test_retrieval_against_sklearn
from torchmetrics.retrieval.mean_reciprocal_rank import RetrievalMRR


def _reciprocal_rank(target: np.array, preds: np.array):
"""
Implementation of reciprocal rank because couldn't find a good implementation.
`sklearn.metrics.label_ranking_average_precision_score` is similar but works in a different way
then the number of positive labels is greater than 1.
Adaptation of `sklearn.metrics.label_ranking_average_precision_score`.
Since the original sklearn metric works as RR only when the number of positive
targets is exactly 1, here we remove every positive target that is not the most
important. Remember that in RR only the positive target with the highest score is considered.
"""
assert target.shape == preds.shape
assert len(target.shape) == 1 # works only with single dimension inputs

# going to remove T targets that are not ranked as highest
indexes = preds[target.astype(np.bool)]
if len(indexes) > 0:
target[preds != indexes.max(-1, keepdims=True)[0]] = 0 # ensure that only 1 positive label is present

if target.sum() > 0:
target = target[np.argsort(preds, axis=-1)][::-1]
rank = np.nonzero(target)[0][0] + 1
return 1.0 / rank
# sklearn `label_ranking_average_precision_score` requires at most 2 dims
return label_ranking_average_precision_score(np.expand_dims(target, axis=0), np.expand_dims(preds, axis=0))
else:
return np.NaN
return 0.0


@pytest.mark.parametrize('size', [1, 4, 10])
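As a quick sanity check of the reference above, the functional counterpart documented earlier behaves as follows for a query with a single relevant document:

import torch
from torchmetrics.functional import retrieval_reciprocal_rank

preds = torch.tensor([0.2, 0.5, 0.3, 0.9])
target = torch.tensor([False, True, False, False])

# The relevant document has the second-highest score, so its rank is 2 and RR = 1/2.
retrieval_reciprocal_rank(preds, target)  # -> tensor(0.5000)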
56 changes: 56 additions & 0 deletions tests/retrieval/test_precision.py
@@ -0,0 +1,56 @@
import numpy as np
import pytest

from tests.retrieval.helpers import _test_dtypes, _test_input_args, _test_input_shapes, _test_retrieval_against_sklearn
from torchmetrics.retrieval.retrieval_precision import RetrievalPrecision


def _precision_at_k(target: np.array, preds: np.array, k: int = None):
"""
Didn't find a reliable implementation of Precision in Information Retrieval, so it is
reimplemented here. A good explanation can be found ``
"""
assert target.shape == preds.shape
assert len(target.shape) == 1 # works only with single dimension inputs

if k is None:
k = len(preds)

if target.sum() > 0:
order_indexes = np.argsort(preds, axis=0)[::-1]
relevant = np.sum(target[order_indexes][:k])
return relevant * 1.0 / k
else:
return np.NaN


@pytest.mark.parametrize('size', [1, 4, 10])
@pytest.mark.parametrize('n_documents', [1, 5])
@pytest.mark.parametrize('query_without_relevant_docs_options', ['skip', 'pos', 'neg'])
@pytest.mark.parametrize('k', [None, 1, 4, 10])
def test_results(size, n_documents, query_without_relevant_docs_options, k):
""" Test metrics are computed correctly. """
_test_retrieval_against_sklearn(
_precision_at_k,
RetrievalPrecision,
size,
n_documents,
query_without_relevant_docs_options,
k=k
)


def test_dtypes():
""" Check dypes are managed correctly. """
_test_dtypes(RetrievalPrecision)


def test_input_shapes() -> None:
"""Check inputs shapes are managed correctly. """
_test_input_shapes(RetrievalPrecision)


@pytest.mark.parametrize('k', [-1, 1.0])
def test_input_params(k) -> None:
"""Check invalid args are managed correctly. """
_test_input_args(RetrievalPrecision, "`k` has to be a positive integer or None", k=k)
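For intuition, a worked call of the `_precision_at_k` reference above (plain numpy, assuming the function is in scope):

import numpy as np

preds = np.array([0.4, 0.1, 0.8, 0.3])
target = np.array([1, 0, 1, 0])

_precision_at_k(target, preds, k=2)  # top-2 scores are 0.8 and 0.4, both relevant -> 1.0
_precision_at_k(target, preds)       # k defaults to all 4 documents, 2 of them relevant -> 0.5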
Empty file added tests/wrappers/__init__.py
Empty file.