diff --git a/spacy/errors.py b/spacy/errors.py
index 093c65f3d1a..b6108dd0ff7 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -984,6 +984,9 @@ class Errors(metaclass=ErrorsWithCodes):
E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
"but only callbacks with one or three parameters are supported")
E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
+ E1057 = ("The `TextCatReduce` architecture must be used with at least one "
+ "reduction. Please enable one of `use_reduce_first`, "
+ "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
# Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index e6d1f030fef..93929bd4ec9 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -17,6 +17,9 @@
clone,
concatenate,
list2ragged,
+ reduce_first,
+ reduce_last,
+ reduce_max,
reduce_mean,
reduce_sum,
residual,
@@ -49,39 +52,15 @@ def build_simple_cnn_text_classifier(
outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
is applied instead, so that outputs are in the range [0, 1].
"""
- fill_defaults = {"b": 0, "W": 0}
- with Model.define_operators({">>": chain}):
- cnn = tok2vec >> list2ragged() >> reduce_mean()
- nI = tok2vec.maybe_get_dim("nO")
- if exclusive_classes:
- output_layer = Softmax(nO=nO, nI=nI)
- fill_defaults["b"] = NEG_VALUE
- resizable_layer: Model = resizable(
- output_layer,
- resize_layer=partial(
- resize_linear_weighted, fill_defaults=fill_defaults
- ),
- )
- model = cnn >> resizable_layer
- else:
- output_layer = Linear(nO=nO, nI=nI)
- resizable_layer = resizable(
- output_layer,
- resize_layer=partial(
- resize_linear_weighted, fill_defaults=fill_defaults
- ),
- )
- model = cnn >> resizable_layer >> Logistic()
- model.set_ref("output_layer", output_layer)
- model.attrs["resize_output"] = partial(
- resize_and_set_ref,
- resizable_layer=resizable_layer,
- )
- model.set_ref("tok2vec", tok2vec)
- if nO is not None:
- model.set_dim("nO", cast(int, nO))
- model.attrs["multi_label"] = not exclusive_classes
- return model
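+    # `TextCatCNN` is a `TextCatReduce` model that only uses the mean reduction.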
+ return build_reduce_text_classifier(
+ tok2vec=tok2vec,
+ exclusive_classes=exclusive_classes,
+ use_reduce_first=False,
+ use_reduce_last=False,
+ use_reduce_max=False,
+ use_reduce_mean=True,
+ nO=nO,
+ )
def resize_and_set_ref(model, new_nO, resizable_layer):
@@ -230,3 +209,80 @@ def build_text_classifier_lowdata(
model = model >> Dropout(dropout)
model = model >> Logistic()
return model
+
+
+@registry.architectures("spacy.TextCatReduce.v1")
+def build_reduce_text_classifier(
+ tok2vec: Model,
+ exclusive_classes: bool,
+ use_reduce_first: bool,
+ use_reduce_last: bool,
+ use_reduce_max: bool,
+ use_reduce_mean: bool,
+ nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+ """Build a model that classifies pooled `Doc` representations.
+
+ Pooling is performed using reductions. Reductions are concatenated when
+ multiple reductions are used.
+
+ tok2vec (Model): the tok2vec layer to pool over.
+ exclusive_classes (bool): Whether or not classes are mutually exclusive.
+ use_reduce_first (bool): Pool by using the hidden representation of the
+ first token of a `Doc`.
+ use_reduce_last (bool): Pool by using the hidden representation of the
+ last token of a `Doc`.
+ use_reduce_max (bool): Pool by taking the maximum values of the hidden
+ representations of a `Doc`.
+ use_reduce_mean (bool): Pool by taking the mean of all hidden
+ representations of a `Doc`.
+ nO (Optional[int]): Number of classes.
+ """
+
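+    # Fill values used for the weights and bias when the output layer is
+    # resized, e.g. when labels are added after initialization.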
+ fill_defaults = {"b": 0, "W": 0}
+ reductions = []
+ if use_reduce_first:
+ reductions.append(reduce_first())
+ if use_reduce_last:
+ reductions.append(reduce_last())
+ if use_reduce_max:
+ reductions.append(reduce_max())
+ if use_reduce_mean:
+ reductions.append(reduce_mean())
+
+    if not reductions:
+ raise ValueError(Errors.E1057)
+
+ with Model.define_operators({">>": chain}):
+ cnn = tok2vec >> list2ragged() >> concatenate(*reductions)
+ nO_tok2vec = tok2vec.maybe_get_dim("nO")
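+        # The reductions are concatenated, so the input width of the output
+        # layer is the tok2vec width multiplied by the number of reductions.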
+ nI = nO_tok2vec * len(reductions) if nO_tok2vec is not None else None
+ if exclusive_classes:
+ output_layer = Softmax(nO=nO, nI=nI)
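+            # Fill the bias of labels added later with a large negative value
+            # so that they start out with near-zero softmax probability.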
+ fill_defaults["b"] = NEG_VALUE
+ resizable_layer: Model = resizable(
+ output_layer,
+ resize_layer=partial(
+ resize_linear_weighted, fill_defaults=fill_defaults
+ ),
+ )
+ model = cnn >> resizable_layer
+ else:
+ output_layer = Linear(nO=nO, nI=nI)
+ resizable_layer = resizable(
+ output_layer,
+ resize_layer=partial(
+ resize_linear_weighted, fill_defaults=fill_defaults
+ ),
+ )
+ model = cnn >> resizable_layer >> Logistic()
+ model.set_ref("output_layer", output_layer)
+ model.attrs["resize_output"] = partial(
+ resize_and_set_ref,
+ resizable_layer=resizable_layer,
+ )
+ model.set_ref("tok2vec", tok2vec)
+ if nO is not None:
+ model.set_dim("nO", cast(int, nO))
+ model.attrs["multi_label"] = not exclusive_classes
+ return model
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 43a335c4ac7..ae227017a9f 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -55,8 +55,12 @@
single_label_cnn_config = """
[model]
-@architectures = "spacy.TextCatCNN.v2"
+@architectures = "spacy.TextCatReduce.v1"
exclusive_classes = true
+use_reduce_first = false
+use_reduce_last = false
+use_reduce_max = false
+use_reduce_mean = true
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index c917cc61078..2f8d5e60437 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -53,8 +53,12 @@
multi_label_cnn_config = """
[model]
-@architectures = "spacy.TextCatCNN.v2"
+@architectures = "spacy.TextCatReduce.v1"
exclusive_classes = false
+use_reduce_first = false
+use_reduce_last = false
+use_reduce_max = false
+use_reduce_mean = true
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 147ea49005c..5dff8d12455 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -457,8 +457,8 @@ def test_no_resize(name, textcat_config):
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
# CNN
- ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
- ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+ ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+ ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
],
)
# fmt: on
@@ -485,9 +485,9 @@ def test_resize(name, textcat_config):
("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
- # CNN
- ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
- ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+ # REDUCE
+ ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+ ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
],
)
# fmt: on
@@ -701,9 +701,12 @@ def test_overfitting_IO_multi():
# ENSEMBLE V2
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
- # CNN V2
+ # CNN V2 (legacy)
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+ # REDUCE V1
+ ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+ ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
],
)
# fmt: on
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index e6692ad92c0..5228b4544fd 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -26,6 +26,7 @@
build_Tok2Vec_model,
)
from spacy.ml.staticvectors import StaticVectors
+from spacy.util import registry
def get_textcat_bow_kwargs():
@@ -284,3 +285,17 @@ def test_spancat_model_forward_backward(nO=5):
Y, backprop = model((docs, spans), is_train=True)
assert Y.shape == (spans.dataXd.shape[0], nO)
backprop(Y)
+
+
+def test_textcat_reduce_invalid_args():
+ textcat_reduce = registry.architectures.get("spacy.TextCatReduce.v1")
+ tok2vec = make_test_tok2vec()
+ with pytest.raises(ValueError, match=r"must be used with at least one reduction"):
+ textcat_reduce(
+ tok2vec=tok2vec,
+ exclusive_classes=False,
+ use_reduce_first=False,
+ use_reduce_last=False,
+ use_reduce_max=False,
+ use_reduce_mean=False,
+ )
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 9d8b3ddfa5a..63f723a28cf 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -1018,46 +1018,6 @@ but used an internal `tok2vec` instead of taking it as argument:
-### spacy.TextCatCNN.v2 {id="TextCatCNN"}
-
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatCNN.v2"
-> exclusive_classes = false
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v2"
-> pretrained_vectors = null
-> width = 96
-> depth = 4
-> embed_size = 2000
-> window_size = 1
-> maxout_pieces = 3
-> subword_features = true
-> ```
-
-A neural network model where token vectors are calculated using a CNN. The
-vectors are mean pooled and used as features in a feed-forward network. This
-architecture is usually less accurate than the ensemble, but runs faster.
-
-| Name | Description |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
-| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
-| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
-
-
-
-[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
-not yet resizable. Since v2, new labels can be added to this component, even
-after training.
-
-
-
### spacy.TextCatBOW.v3 {id="TextCatBOW"}
> #### Example Config
@@ -1096,6 +1056,54 @@ the others, but may not be as accurate, especially if texts are short.
+### spacy.TextCatReduce.v1 {id="TextCatReduce"}
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatReduce.v1"
+> exclusive_classes = false
+> use_reduce_first = false
+> use_reduce_last = false
+> use_reduce_max = false
+> use_reduce_mean = true
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.HashEmbedCNN.v2"
+> pretrained_vectors = null
+> width = 96
+> depth = 4
+> embed_size = 2000
+> window_size = 1
+> maxout_pieces = 3
+> subword_features = true
+> ```
+
+A classifier that pools the token hidden representations of each `Doc` using
+first, last, max or mean reduction and then applies a classification layer.
+Reductions are concatenated when multiple reductions are used.
+
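+For instance, a config along the following lines (illustrative, not one of the
+shipped defaults) enables both the max and the mean reduction, so the output
+layer receives a concatenated vector of twice the `tok2vec` width (the
+`[model.tok2vec]` block is configured as in the example above):
+
+```ini
+[model]
+@architectures = "spacy.TextCatReduce.v1"
+exclusive_classes = false
+use_reduce_first = false
+use_reduce_last = false
+use_reduce_max = true
+use_reduce_mean = true
+nO = null
+```
+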
+
+
+`TextCatReduce` is a generalization of the older
+[`TextCatCNN`](/api/legacy#TextCatCNN_v2) model. `TextCatCNN` always uses a mean
+reduction, whereas `TextCatReduce` also supports first, last and max reductions.
+
+
+
+| Name | Description |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
+| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
+| `use_reduce_first` | Pool by using the hidden representation of the first token of a `Doc`. ~~bool~~ |
+| `use_reduce_last` | Pool by using the hidden representation of the last token of a `Doc`. ~~bool~~ |
+| `use_reduce_max` | Pool by taking the maximum values of the hidden representations of a `Doc`. ~~bool~~ |
+| `use_reduce_mean` | Pool by taking the mean of all hidden representations of a `Doc`. ~~bool~~ |
+| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
+
## Span classification architectures {id="spancat",source="spacy/ml/models/spancat.py"}
### spacy.SpanCategorizer.v1 {id="SpanCategorizer"}
diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx
index 32111ce9233..b44df538766 100644
--- a/website/docs/api/legacy.mdx
+++ b/website/docs/api/legacy.mdx
@@ -162,7 +162,10 @@ network has an internal CNN Tok2Vec layer and uses attention.
Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means
that you can add labels to a previously trained textcat. `TextCatCNN` v1 did not
-yet support that.
+yet support that. `TextCatCNN` has been replaced by the more general
+[`TextCatReduce`](/api/architectures#TextCatReduce) layer. `TextCatCNN` is
+identical to `TextCatReduce` with `use_reduce_mean=true`,
+`use_reduce_first=false`, `use_reduce_last=false` and `use_reduce_max=false`.
> #### Example Config
>
@@ -194,6 +197,51 @@ architecture is usually less accurate than the ensemble, but runs faster.
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
+### spacy.TextCatCNN.v2 {id="TextCatCNN_v2"}
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatCNN.v2"
+> exclusive_classes = false
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.HashEmbedCNN.v2"
+> pretrained_vectors = null
+> width = 96
+> depth = 4
+> embed_size = 2000
+> window_size = 1
+> maxout_pieces = 3
+> subword_features = true
+> ```
+
+A neural network model where token vectors are calculated using a CNN. The
+vectors are mean pooled and used as features in a feed-forward network. This
+architecture is usually less accurate than the ensemble, but runs faster.
+
+`TextCatCNN` has been replaced by the more general
+[`TextCatReduce`](/api/architectures#TextCatReduce) layer. `TextCatCNN` is
+identical to `TextCatReduce` with `use_reduce_mean=true`,
+`use_reduce_first=false`, `use_reduce_last=false` and `use_reduce_max=false`.
+
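+For example, migrating from the config above only requires swapping the
+architecture name and spelling out the reduction flags; a sketch of the
+equivalent `TextCatReduce` block (the `[model.tok2vec]` block stays unchanged):
+
+```ini
+[model]
+@architectures = "spacy.TextCatReduce.v1"
+exclusive_classes = false
+use_reduce_first = false
+use_reduce_last = false
+use_reduce_max = false
+use_reduce_mean = true
+nO = null
+```
+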
+| Name | Description |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
+| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
+| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
+
+
+
+[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
+not yet resizable. Since v2, new labels can be added to this component, even
+after training.
+
+
+
### spacy.TextCatBOW.v1 {id="TextCatBOW_v1"}
Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means