diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 54c880a7d89..2ef639cad52 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -1,10 +1,10 @@
from collections import Counter
from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast
import numpy as np
import srsly
-from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy
from thinc.types import ArrayXd, Floats2d, Ints1d
from .. import util
@@ -18,6 +18,10 @@
from .lemmatizer import lemmatizer_score
from .trainable_pipe import TrainablePipe
+# The cutoff value of *top_k* above which an alternative method is used to process guesses.
+TOP_K_GUARDRAIL = 20
+
+
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
@@ -50,7 +54,6 @@
"top_k": 1,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
"save_activations": False,
- "save_activations": False,
},
default_score_weights={"lemma_acc": 1.0},
)
@@ -64,7 +67,6 @@ def make_edit_tree_lemmatizer(
top_k: int,
scorer: Optional[Callable],
save_activations: bool,
- save_activations: bool,
):
"""Construct an EditTreeLemmatizer component."""
return EditTreeLemmatizer(
@@ -77,7 +79,6 @@ def make_edit_tree_lemmatizer(
top_k=top_k,
scorer=scorer,
save_activations=save_activations,
- save_activations=save_activations,
)
@@ -98,7 +99,6 @@ def __init__(
top_k: int = 1,
scorer: Optional[Callable] = lemmatizer_score,
save_activations: bool = False,
- save_activations: bool = False,
):
"""
Construct an edit tree lemmatizer.
@@ -111,7 +111,6 @@ def __init__(
overwrite (bool): overwrite existing lemma annotations.
top_k (int): try to apply at most the k most probable edit trees.
save_activations (bool): save model activations in Doc when annotating.
- save_activations (bool): save model activations in Doc when annotating.
"""
self.vocab = vocab
self.model = model
@@ -127,7 +126,6 @@ def __init__(
self.cfg: Dict[str, Any] = {"labels": []}
self.scorer = scorer
self.save_activations = save_activations
- self.save_activations = save_activations
def get_loss(
self, examples: Iterable[Example], scores: List[Floats2d]
@@ -156,25 +154,6 @@ def get_loss(
return float(loss), d_scores
- def get_teacher_student_loss(
- self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
- ) -> Tuple[float, List[Floats2d]]:
- """Calculate the loss and its gradient for a batch of student
- scores, relative to teacher scores.
-
- teacher_scores: Scores representing the teacher model's predictions.
- student_scores: Scores representing the student model's predictions.
-
- RETURNS (Tuple[float, float]): The loss and the gradient.
-
- DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
- """
- loss_func = SequenceCategoricalCrossentropy(normalize=False)
- d_scores, loss = loss_func(student_scores, teacher_scores)
- if self.model.ops.xp.isnan(loss):
- raise ValueError(Errors.E910.format(name=self.name))
- return float(loss), d_scores
-
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
n_docs = len(list(docs))
if not any(len(doc) for doc in docs):
@@ -186,21 +165,13 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
scores: List[Floats2d] = [
self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
]
- guesses: List[Ints1d] = [
- self.model.ops.alloc((0,), dtype="i") for doc in docs
- ]
- scores: List[Floats2d] = [
- self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
- ]
assert len(guesses) == n_docs
return {"probabilities": scores, "tree_ids": guesses}
- return {"probabilities": scores, "tree_ids": guesses}
scores = self.model.predict(docs)
assert len(scores) == n_docs
guesses = scores2guesses(docs, scores)
assert len(guesses) == n_docs
return {"probabilities": scores, "tree_ids": guesses}
- return {"probabilities": scores, "tree_ids": guesses}
def _scores2guesses_top_k_equals_1(self, docs, scores):
guesses = []
@@ -260,15 +231,9 @@ def _scores2guesses_top_k_guardrail(self, docs, scores):
return guesses
- def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
- batch_tree_ids = activations["tree_ids"]
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
batch_tree_ids = activations["tree_ids"]
for i, doc in enumerate(docs):
- if self.save_activations:
- doc.activations[self.name] = {}
- for act_name, acts in activations.items():
- doc.activations[self.name][act_name] = acts[i]
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index eb87d1db987..0f15ef38d45 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -6,37 +6,24 @@
from itertools import islice
import srsly
import random
-import warnings
from itertools import islice
from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Union, cast
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
import srsly
-from numpy import dtype
from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate
-from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
+from thinc.types import Floats2d
-from ..kb import KnowledgeBase, Candidate
-from ..ml import empty_kb
-from ..tokens import Doc, Span
-from ..ml import empty_kb
-from ..tokens import Doc, Span, SpanGroup
-from .pipe import deserialize_config
-from .trainable_pipe import TrainablePipe
-from ..language import Language
-from ..vocab import Vocab
-from ..training import Example, validate_examples, validate_get_examples
-from ..errors import Errors
-from ..util import SimpleFrozenList, registry
from .. import util
from ..errors import Errors
from ..kb import Candidate, KnowledgeBase
from ..language import Language
from ..scorer import Scorer
-from ..tokens import Doc, Span, SpanGroup
+from ..tokens import Doc, Span
from ..training import Example, validate_examples, validate_get_examples
from ..util import SimpleFrozenList, registry
from ..vocab import Vocab
+from .legacy.entity_linker import EntityLinker_v1
from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe
@@ -45,6 +32,9 @@
KNOWLEDGE_BASE_IDS = "kb_ids"
+# See #9050
+BACKWARD_OVERWRITE = True
+
default_model_config = """
[model]
@architectures = "spacy.EntityLinker.v2"
@@ -75,13 +65,13 @@
"entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
- "overwrite": False,
+ "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
+ "overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True,
"candidates_batch_size": 1,
"threshold": None,
"save_activations": False,
- "save_activations": False,
},
default_score_weights={
"nel_micro_f": 1.0,
@@ -101,7 +91,7 @@ def make_entity_linker(
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
get_candidates_batch: Callable[
- [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
+ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
],
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
overwrite: bool,
@@ -110,7 +100,6 @@ def make_entity_linker(
candidates_batch_size: int,
threshold: Optional[float] = None,
save_activations: bool,
- save_activations: bool,
):
"""Construct an EntityLinker component.
@@ -125,7 +114,7 @@ def make_entity_linker(
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
get_candidates_batch (
- Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
+ Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
scorer (Optional[Callable]): The scoring method.
@@ -135,11 +124,23 @@ def make_entity_linker(
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
prediction is discarded. If None, predictions are not filtered by any threshold.
save_activations (bool): save model activations in Doc when annotating.
- save_activations (bool): save model activations in Doc when annotating.
"""
- if not model.attrs.get("include_span_maker", False):
- raise ValueError(Errors.E4005)
+ if not model.attrs.get("include_span_maker", False):
+ # The only difference in arguments here is that use_gold_ents and threshold aren't available.
+ return EntityLinker_v1(
+ nlp.vocab,
+ model,
+ name,
+ labels_discard=labels_discard,
+ n_sents=n_sents,
+ incl_prior=incl_prior,
+ incl_context=incl_context,
+ entity_vector_length=entity_vector_length,
+ get_candidates=get_candidates,
+ overwrite=overwrite,
+ scorer=scorer,
+ )
return EntityLinker(
nlp.vocab,
model,
@@ -158,7 +159,6 @@ def make_entity_linker(
candidates_batch_size=candidates_batch_size,
threshold=threshold,
save_activations=save_activations,
- save_activations=save_activations,
)
@@ -192,15 +192,15 @@ def __init__(
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
get_candidates_batch: Callable[
- [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
+ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
],
- overwrite: bool = False,
+ generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
+ overwrite: bool = BACKWARD_OVERWRITE,
scorer: Optional[Callable] = entity_linker_score,
use_gold_ents: bool,
candidates_batch_size: int,
threshold: Optional[float] = None,
save_activations: bool = False,
- save_activations: bool = False,
) -> None:
"""Initialize an entity linker.
@@ -216,10 +216,10 @@ def __init__(
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
get_candidates_batch (
- Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
+ Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
- overwrite (bool): Whether to overwrite existing non-empty annotations.
+ generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
@@ -255,12 +255,9 @@ def __init__(
self.candidates_batch_size = candidates_batch_size
self.threshold = threshold
self.save_activations = save_activations
- self.save_activations = save_activations
if candidates_batch_size < 1:
raise ValueError(Errors.E1044)
- if self.incl_prior and not self.kb.supports_prior_probs:
- warnings.warn(Warnings.W401)
def _score_with_ents_set(examples: Iterable[Example], **kwargs):
# Because of how spaCy works, we can't just score immediately, because Language.evaluate
@@ -463,7 +460,6 @@ def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
loss = loss / len(entity_encodings)
return float(loss), out
- def predict(self, docs: Iterable[Doc]) -> ActivationsT:
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them.
Returns the KB IDs for each entity in each doc, including NIL if there is
@@ -481,47 +477,39 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
xp = ops.xp
docs_ents: List[Ragged] = []
docs_scores: List[Ragged] = []
- ops = self.model.ops
- xp = ops.xp
- docs_ents: List[Ragged] = []
- docs_scores: List[Ragged] = []
if not docs:
- return {
- KNOWLEDGE_BASE_IDS: final_kb_ids,
- "ents": docs_ents,
- "scores": docs_scores,
- }
+ return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
if isinstance(docs, Doc):
docs = [docs]
- for doc in docs:
- doc_ents: List[Ints1d] = []
- doc_scores: List[Floats1d] = []
for doc in docs:
doc_ents: List[Ints1d] = []
doc_scores: List[Floats1d] = []
if len(doc) == 0:
- docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
- docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
continue
sentences = [s for s in doc.sents]
- # Loop over entities in batches.
- for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
- ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
-
- # Look up candidate entities.
- valid_ent_idx = [
- idx
- for idx in range(len(ent_batch))
- if ent_batch[idx].label_ not in self.labels_discard
- ]
-
- batch_candidates = list(
- self.get_candidates_batch(
- self.kb,
- SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
+ if self.incl_context:
+ # get n_neighbour sentences, clipped to the length of the document
+ start_sentence = max(0, sent_index - self.n_sents)
+ end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
+ start_token = sentences[start_sentence].start
+ end_token = sentences[end_sentence].end
+ sent_doc = doc[start_token:end_token].as_doc()
+ # currently, the context is the same for each entity in a sentence (should be refined)
+ sentence_encoding = self.model.predict([sent_doc])[0]
+ sentence_encoding_t = sentence_encoding.T
+ sentence_norm = xp.linalg.norm(sentence_encoding_t)
+ entity_count += 1
+ if ent.label_ in self.labels_discard:
+ # ignoring this entity - setting to NIL
+ final_kb_ids.append(self.NIL)
+ self._add_activations(
+ doc_scores=doc_scores,
+ doc_ents=doc_ents,
+ scores=[0.0],
+ ents=[0],
)
else:
candidates = list(self.get_candidates(self.kb, ent))
@@ -592,39 +580,23 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
method="predict", msg="result variables not of equal length"
)
raise RuntimeError(err)
- return {
- KNOWLEDGE_BASE_IDS: final_kb_ids,
- "ents": docs_ents,
- "scores": docs_scores,
- }
+ return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
- def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
activations (ActivationsT): The activations used for setting annotations, produced
by EntityLinker.predict.
- activations (ActivationsT): The activations used for setting annotations, produced
- by EntityLinker.predict.
DOCS: https://spacy.io/api/entitylinker#set_annotations
"""
kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
- kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
count_ents = len([ent for doc in docs for ent in doc.ents])
if count_ents != len(kb_ids):
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
i = 0
overwrite = self.cfg["overwrite"]
- for j, doc in enumerate(docs):
- if self.save_activations:
- doc.activations[self.name] = {}
- for act_name, acts in activations.items():
- if act_name != KNOWLEDGE_BASE_IDS:
- # We only copy activations that are Ragged.
- doc.activations[self.name][act_name] = cast(Ragged, acts[j])
-
for j, doc in enumerate(docs):
if self.save_activations:
doc.activations[self.name] = {}
@@ -760,32 +732,3 @@ def _add_activations(
ops = self.model.ops
doc_scores.append(ops.asarray1f(scores))
doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
-
- def _add_doc_activations(
- self,
- *,
- docs_scores: List[Ragged],
- docs_ents: List[Ragged],
- doc_scores: List[Floats1d],
- doc_ents: List[Ints1d],
- ):
- if not self.save_activations:
- return
- ops = self.model.ops
- lengths = ops.asarray1i([s.shape[0] for s in doc_scores])
- docs_scores.append(Ragged(ops.flatten(doc_scores), lengths))
- docs_ents.append(Ragged(ops.flatten(doc_ents), lengths))
-
- def _add_activations(
- self,
- *,
- doc_scores: List[Floats1d],
- doc_ents: List[Ints1d],
- scores: Sequence[float],
- ents: Sequence[int],
- ):
- if not self.save_activations:
- return
- ops = self.model.ops
- doc_scores.append(ops.asarray1f(scores))
- doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 443b6818dc2..cc8f87936b9 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,11 +1,10 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Callable, Dict, Iterable, List, Optional, Union
import srsly
-from thinc.api import Model, Config
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from thinc.types import Floats2d, Ints1d
from itertools import islice
-from typing import Callable, Dict, Iterable, Optional, Union
+from typing import Callable, Dict, Optional, Union
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
@@ -24,10 +23,13 @@ from ..errors import Errors
from ..language import Language
from ..parts_of_speech import IDS as POS_IDS
from ..scorer import Scorer
-from ..symbols import POS
from ..training import validate_examples, validate_get_examples
from ..util import registry
-from .tagger import ActivationsT, Tagger
+from .tagger import Tagger
+
+# See #9050
+BACKWARD_OVERWRITE = True
+BACKWARD_EXTEND = False
default_model_config = """
[model]
@@ -65,13 +67,6 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
"save_activations": False,
},
- default_config={
- "model": DEFAULT_MORPH_MODEL,
- "overwrite": True,
- "extend": False,
- "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
- "save_activations": False,
- },
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
)
def make_morphologizer(
@@ -83,12 +78,9 @@ def make_morphologizer(
label_smoothing: float,
scorer: Optional[Callable],
save_activations: bool,
- save_activations: bool,
):
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
save_activations=save_activations)
- return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
- save_activations=save_activations)
def morphologizer_score(examples, **kwargs):
@@ -120,11 +112,11 @@ class Morphologizer(Tagger):
model: Model,
name: str = "morphologizer",
*,
- overwrite: bool = False,
- extend: bool = False,
+ overwrite: bool = BACKWARD_OVERWRITE,
+ extend: bool = BACKWARD_EXTEND,
+ label_smoothing: float = 0.0,
scorer: Optional[Callable] = morphologizer_score,
save_activations: bool = False,
- save_activations: bool = False,
):
"""Initialize a morphologizer.
@@ -132,13 +124,10 @@ class Morphologizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
- overwrite (bool): Whether to overwrite existing annotations.
- extend (bool): Whether to extend existing annotations.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".
save_activations (bool): save model activations in Doc when annotating.
- save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/morphologizer#init
"""
@@ -160,7 +149,6 @@ class Morphologizer(Tagger):
self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
self.save_activations = save_activations
- self.save_activations = save_activations
@property
def labels(self):
@@ -254,18 +242,15 @@ class Morphologizer(Tagger):
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
- def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
- activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
DOCS: https://spacy.io/api/morphologizer#set_annotations
"""
batch_tag_ids = activations["label_ids"]
- batch_tag_ids = activations["label_ids"]
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
@@ -276,10 +261,6 @@ class Morphologizer(Tagger):
# to allocate a compatible container out of the iterable.
labels = tuple(self.labels)
for i, doc in enumerate(docs):
- if self.save_activations:
- doc.activations[self.name] = {}
- for act_name, acts in activations.items():
- doc.activations[self.name][act_name] = acts[i]
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
@@ -321,8 +302,7 @@ class Morphologizer(Tagger):
DOCS: https://spacy.io/api/morphologizer#get_loss
"""
validate_examples(examples, "Morphologizer.get_loss")
- loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
- label_smoothing=self.cfg["label_smoothing"])
+ loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
truths = []
for eg in examples:
eg_truths = []
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 35627bbf2ad..521afe1d181 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,18 +1,25 @@
# cython: infer_types=True, profile=True, binding=True
+from typing import Dict, Iterable, Optional, Callable, List, Union
from itertools import islice
-from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Callable, Optional
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
+import srsly
+from thinc.api import Model, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d, Ints1d
from ..tokens.doc cimport Doc
-from .. import util
+from .tagger import ActivationsT, Tagger
+from ..language import Language
from ..errors import Errors
from ..language import Language
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
-from .tagger import ActivationsT, Tagger
+from .tagger import Tagger
+
+# See #9050
+BACKWARD_OVERWRITE = False
default_model_config = """
[model]
@@ -40,12 +47,6 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
"scorer": {"@scorers": "spacy.senter_scorer.v1"},
"save_activations": False,
},
- default_config={
- "model": DEFAULT_SENTER_MODEL,
- "overwrite": False,
- "scorer": {"@scorers": "spacy.senter_scorer.v1"},
- "save_activations": False,
- },
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_senter(nlp: Language,
@@ -55,13 +56,6 @@ def make_senter(nlp: Language,
scorer: Optional[Callable],
save_activations: bool):
return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)
-def make_senter(nlp: Language,
- name: str,
- model: Model,
- overwrite: bool,
- scorer: Optional[Callable],
- save_activations: bool):
- return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations)
def senter_score(examples, **kwargs):
@@ -89,10 +83,9 @@ class SentenceRecognizer(Tagger):
model,
name="senter",
*,
- overwrite=False,
+ overwrite=BACKWARD_OVERWRITE,
scorer=senter_score,
save_activations: bool = False,
- save_activations: bool = False,
):
"""Initialize a sentence recognizer.
@@ -100,11 +93,9 @@ class SentenceRecognizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
- overwrite (bool): Whether to overwrite existing annotations.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
save_activations (bool): save model activations in Doc when annotating.
- save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/sentencerecognizer#init
"""
@@ -115,7 +106,6 @@ class SentenceRecognizer(Tagger):
self.cfg = {"overwrite": overwrite}
self.scorer = scorer
self.save_activations = save_activations
- self.save_activations = save_activations
@property
def labels(self):
@@ -133,27 +123,20 @@ class SentenceRecognizer(Tagger):
def label_data(self):
return None
- def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
- activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
"""
batch_tag_ids = activations["label_ids"]
- batch_tag_ids = activations["label_ids"]
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"]
for i, doc in enumerate(docs):
- if self.save_activations:
- doc.activations[self.name] = {}
- for act_name, acts in activations.items():
- doc.activations[self.name][act_name] = acts[i]
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 9d9415692a8..1450bb5d6cb 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -1,29 +1,14 @@
-from dataclasses import dataclass
-from functools import partial
-from typing import (
- Any,
- Callable,
- Dict,
- Iterable,
- List,
- Optional,
- Protocol,
- Tuple,
- Union,
- cast,
- runtime_checkable,
-)
+from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
+from typing import Union
+from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
+from thinc.api import Optimizer
+from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
import numpy
from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
-from ..scorer import Scorer
-from ..language import Language
-from .trainable_pipe import TrainablePipe
-from ..tokens import Doc, SpanGroup, Span
-from ..vocab import Vocab
-from ..training import Example, validate_examples
+from ..compat import Protocol, runtime_checkable
from ..errors import Errors
from ..language import Language
from ..scorer import Scorer
@@ -36,9 +21,6 @@
ActivationsT = Dict[str, Union[Floats2d, Ragged]]
-ActivationsT = Dict[str, Union[Floats2d, Ragged]]
-
-
spancat_default_config = """
[model]
@architectures = "spacy.SpanCategorizer.v1"
@@ -194,7 +176,6 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester:
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
"save_activations": False,
- "save_activations": False,
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)
@@ -208,7 +189,6 @@ def make_spancat(
threshold: float,
max_positive: Optional[int],
save_activations: bool,
- save_activations: bool,
) -> "SpanCategorizer":
"""Create a SpanCategorizer component and configure it for multi-label
classification to be able to assign multiple labels for each span.
@@ -237,7 +217,6 @@ def make_spancat(
max_positive (Optional[int]): Maximum number of labels to consider positive
per span. Defaults to None, indicating no limit.
save_activations (bool): save model activations in Doc when annotating.
- save_activations (bool): save model activations in Doc when annotating.
"""
return SpanCategorizer(
nlp.vocab,
@@ -317,7 +296,6 @@ def make_spancat_singlelabel(
threshold=None,
scorer=scorer,
save_activations=save_activations,
- save_activations=save_activations,
)
@@ -381,7 +359,6 @@ def __init__(
threshold: Optional[float] = 0.5,
scorer: Optional[Callable] = spancat_score,
save_activations: bool = False,
- save_activations: bool = False,
) -> None:
"""Initialize the multi-label or multi-class span categorizer.
@@ -432,7 +409,6 @@ def __init__(
self.name = name
self.scorer = scorer
self.save_activations = save_activations
- self.save_activations = save_activations
@property
def key(self) -> str:
@@ -490,7 +466,6 @@ def label_data(self) -> List[str]:
"""
return list(self.labels)
- def predict(self, docs: Iterable[Doc]) -> ActivationsT:
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them.
@@ -502,8 +477,6 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
indices = self.suggester(docs, ops=self.model.ops)
scores = self.model.predict((docs, indices)) # type: ignore
return {"indices": indices, "scores": scores}
- scores = self.model.predict((docs, indices)) # type: ignore
- return {"indices": indices, "scores": scores}
def set_candidates(
self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
@@ -523,13 +496,11 @@ def set_candidates(
for index in candidates.dataXd:
doc.spans[candidates_key].append(doc[index[0] : index[1]])
- def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
"""Modify a batch of Doc objects, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
- activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
DOCS: https://spacy.io/api/spancategorizer#set_annotations
"""
@@ -538,9 +509,10 @@ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> Non
indices = activations["indices"]
assert isinstance(indices, Ragged)
scores = cast(Floats2d, activations["scores"])
+
offset = 0
for i, doc in enumerate(docs):
- indices_i = cast(Ints2d, indices[i].dataXd)
+ indices_i = indices[i].dataXd
if self.save_activations:
doc.activations[self.name] = {}
doc.activations[self.name]["indices"] = indices_i
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index ccd401b6af9..8ecd0c46ee0 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,37 +1,32 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Callable, Dict, Iterable, List, Optional, Union
-from typing import Tuple
import numpy
import srsly
-from thinc.api import Model, set_dropout_rate, Config
-from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
from thinc.types import Floats2d, Ints1d
import warnings
from itertools import islice
-from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Callable, Optional
import numpy
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
-from thinc.types import Floats2d, Ints1d
-from ..morphology cimport Morphology
from ..tokens.doc cimport Doc
-from ..vocab cimport Vocab
from .. import util
-from ..attrs import ID, POS
-from ..errors import Errors, Warnings
+from ..errors import Errors
from ..language import Language
-from ..parts_of_speech import X
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
-from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
+# See #9050
+BACKWARD_OVERWRITE = False
+
default_model_config = """
[model]
@architectures = "spacy.Tagger.v2"
@@ -59,13 +54,6 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
"neg_prefix": "!",
"save_activations": False,
},
- default_config={
- "model": DEFAULT_TAGGER_MODEL,
- "overwrite": False,
- "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
- "neg_prefix": "!",
- "save_activations": False,
- },
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(
@@ -76,7 +64,6 @@ def make_tagger(
scorer: Optional[Callable],
neg_prefix: str,
save_activations: bool,
- save_activations: bool,
):
"""Construct a part-of-speech tagger component.
@@ -87,8 +74,6 @@ def make_tagger(
"""
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
save_activations=save_activations)
- return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
- save_activations=save_activations)
def tagger_score(examples, **kwargs):
@@ -111,11 +96,10 @@ class Tagger(TrainablePipe):
model,
name="tagger",
*,
- overwrite=False,
+ overwrite=BACKWARD_OVERWRITE,
scorer=tagger_score,
neg_prefix="!",
save_activations: bool = False,
- save_activations: bool = False,
):
"""Initialize a part-of-speech tagger.
@@ -123,11 +107,9 @@ class Tagger(TrainablePipe):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
- overwrite (bool): Whether to overwrite existing annotations.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "tag".
save_activations (bool): save model activations in Doc when annotating.
- save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/tagger#init
"""
@@ -139,7 +121,6 @@ class Tagger(TrainablePipe):
self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
self.save_activations = save_activations
- self.save_activations = save_activations
@property
def labels(self):
@@ -158,7 +139,6 @@ class Tagger(TrainablePipe):
"""Data about the labels currently added to the component."""
return tuple(self.cfg["labels"])
- def predict(self, docs) -> ActivationsT:
def predict(self, docs) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them.
@@ -173,13 +153,11 @@ class Tagger(TrainablePipe):
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
assert len(guesses) == len(docs)
return {"probabilities": guesses, "label_ids": guesses}
- return {"probabilities": guesses, "label_ids": guesses}
scores = self.model.predict(docs)
assert len(scores) == len(docs), (len(scores), len(docs))
guesses = self._scores2guesses(scores)
assert len(guesses) == len(docs)
return {"probabilities": scores, "label_ids": guesses}
- return {"probabilities": scores, "label_ids": guesses}
def _scores2guesses(self, scores):
guesses = []
@@ -190,28 +168,21 @@ class Tagger(TrainablePipe):
guesses.append(doc_guesses)
return guesses
- def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
- activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
DOCS: https://spacy.io/api/tagger#set_annotations
"""
batch_tag_ids = activations["label_ids"]
- batch_tag_ids = activations["label_ids"]
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"]
labels = self.labels
for i, doc in enumerate(docs):
- if self.save_activations:
- doc.activations[self.name] = {}
- for act_name, acts in activations.items():
- doc.activations[self.name][act_name] = acts[i]
if self.save_activations:
doc.activations[self.name] = {}
for act_name, acts in activations.items():
@@ -271,6 +242,7 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger#rehearse
"""
+ loss_func = SequenceCategoricalCrossentropy()
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
@@ -284,32 +256,12 @@ class Tagger(TrainablePipe):
set_dropout_rate(self.model, drop)
tag_scores, bp_tag_scores = self.model.begin_update(docs)
tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
- loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
+ grads, loss = loss_func(tag_scores, tutor_tag_scores)
bp_tag_scores(grads)
- if sgd is not None:
- self.finish_update(sgd)
+ self.finish_update(sgd)
losses[self.name] += loss
return losses
- def get_teacher_student_loss(
- self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
- ) -> Tuple[float, List[Floats2d]]:
- """Calculate the loss and its gradient for a batch of student
- scores, relative to teacher scores.
-
- teacher_scores: Scores representing the teacher model's predictions.
- student_scores: Scores representing the student model's predictions.
-
- RETURNS (Tuple[float, float]): The loss and the gradient.
-
- DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
- """
- loss_func = SequenceCategoricalCrossentropy(normalize=False)
- d_scores, loss = loss_func(student_scores, teacher_scores)
- if self.model.ops.xp.isnan(loss):
- raise ValueError(Errors.E910.format(name=self.name))
- return float(loss), d_scores
-
def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and
their predicted scores.
@@ -321,12 +273,7 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger#get_loss
"""
validate_examples(examples, "Tagger.get_loss")
- loss_func = SequenceCategoricalCrossentropy(
- names=self.labels,
- normalize=False,
- neg_prefix=self.cfg["neg_prefix"],
- label_smoothing=self.cfg["label_smoothing"]
- )
+ loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"], label_smoothing=self.cfg["label_smoothing"])
# Convert empty tag "" to missing value None so that both misaligned
# tokens and tokens with missing annotation have the default missing
# value None.
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 13841dd7bbb..79a98b9bc5f 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,3 +1,7 @@
+from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union
+from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
+from thinc.types import Floats2d
+import numpy
from itertools import islice
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index e1c1fdc7a34..ac024ba3639 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -1,5 +1,9 @@
+from typing import Iterable, Optional, Dict, List, Callable, Any, Union
+from thinc.types import Floats2d
+from thinc.api import Model, Config
+
from itertools import islice
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional
from thinc.api import Config, Model
from thinc.types import Floats2d
@@ -80,8 +84,6 @@
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
"save_activations": False,
- "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
- "save_activations": False,
},
default_score_weights={
"cats_score": 1.0,
@@ -103,9 +105,6 @@ def make_multilabel_textcat(
threshold: float,
scorer: Optional[Callable],
save_activations: bool,
-) -> "TextCategorizer":
- """Create a TextCategorizer component. The text categorizer predicts categories
- save_activations: bool,
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
@@ -124,12 +123,6 @@ def make_multilabel_textcat(
threshold=threshold,
scorer=scorer,
save_activations=save_activations,
- nlp.vocab,
- model,
- name,
- threshold=threshold,
- scorer=scorer,
- save_activations=save_activations,
)
@@ -162,7 +155,6 @@ def __init__(
threshold: float,
scorer: Optional[Callable] = textcat_multilabel_score,
save_activations: bool = False,
- save_activations: bool = False,
) -> None:
"""Initialize a text categorizer for multi-label classification.
@@ -171,7 +163,6 @@ def __init__(
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
- scorer (Optional[Callable]): The scoring method.
save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/textcategorizer#init
@@ -184,7 +175,6 @@ def __init__(
self.cfg = dict(cfg)
self.scorer = scorer
self.save_activations = save_activations
- self.save_activations = save_activations
@property
def support_missing_values(self):
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 546a1c48abb..bd360c9501b 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
import srsly
@@ -7,13 +7,13 @@ import warnings
from ..tokens.doc cimport Doc
-from ..training import validate_examples, validate_distillation_examples
+from ..training import validate_examples
from ..errors import Errors, Warnings
from .pipe import Pipe, deserialize_config
from .. import util
from ..errors import Errors
from ..language import Language
-from ..training import Example, validate_distillation_examples, validate_examples
+from ..training import Example, validate_examples
from ..vocab import Vocab
from .pipe import Pipe, deserialize_config
@@ -59,54 +59,7 @@ cdef class TrainablePipe(Pipe):
except Exception as e:
error_handler(self.name, self, [doc], e)
-
- def distill(self,
- teacher_pipe: Optional["TrainablePipe"],
- examples: Iterable["Example"],
- *,
- drop: float=0.0,
- sgd: Optional[Optimizer]=None,
- losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
- """Train a pipe (the student) on the predictions of another pipe
- (the teacher). The student is typically trained on the probability
- distribution of the teacher, but details may differ per pipe.
-
- teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
- from.
- examples (Iterable[Example]): Distillation examples. The reference
- (teacher) and predicted (student) docs must have the same number of
- tokens and the same orthography.
- drop (float): dropout rate.
- sgd (Optional[Optimizer]): An optimizer. Will be created via
- create_optimizer if not set.
- losses (Optional[Dict[str, float]]): Optional record of loss during
- distillation.
- RETURNS: The updated losses dictionary.
-
- DOCS: https://spacy.io/api/pipe#distill
- """
- # By default we require a teacher pipe, but there are downstream
- # implementations that don't require a pipe.
- if teacher_pipe is None:
- raise ValueError(Errors.E4002.format(name=self.name))
- if losses is None:
- losses = {}
- losses.setdefault(self.name, 0.0)
- validate_distillation_examples(examples, "TrainablePipe.distill")
- set_dropout_rate(self.model, drop)
- for node in teacher_pipe.model.walk():
- if node.name == "softmax":
- node.attrs["softmax_normalize"] = True
- teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples])
- student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples])
- loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
- bp_student_scores(d_scores)
- if sgd is not None:
- self.finish_update(sgd)
- losses[self.name] += loss
- return losses
-
- def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
+ def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
@@ -219,19 +172,6 @@ cdef class TrainablePipe(Pipe):
"""
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
- def get_teacher_student_loss(self, teacher_scores, student_scores):
- """Calculate the loss and its gradient for a batch of student
- scores, relative to teacher scores.
-
- teacher_scores: Scores representing the teacher model's predictions.
- student_scores: Scores representing the student model's predictions.
-
- RETURNS (Tuple[float, float]): The loss and the gradient.
-
- DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
- """
- raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))
-
def create_optimizer(self) -> Optimizer:
"""Create an optimizer for the pipeline component.
@@ -268,14 +208,6 @@ cdef class TrainablePipe(Pipe):
"""
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
- @property
- def is_distillable(self) -> bool:
- # Normally a pipe overrides `get_teacher_student_loss` to implement
- # distillation. In more exceptional cases, a pipe can provide its
- # own `distill` implementation. If neither of these methods is
- # overridden, the pipe does not implement distillation.
- return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss)
-
@property
def is_trainable(self) -> bool:
return True
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index 0f925c0d4e1..ba2ed4e5ff3 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -1,5 +1,5 @@
-import pickle
from typing import cast
+import pickle
import hypothesis.strategies as st
import pytest
@@ -10,6 +10,7 @@
from spacy.language import Language
from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
from spacy.pipeline.trainable_pipe import TrainablePipe
+from spacy.training import Example
from spacy.strings import StringStore
from spacy.training import Example
from spacy.util import make_tempdir
@@ -213,53 +214,6 @@ def test_overfitting_IO(top_k):
assert doc4[3].lemma_ == "egg"
-def test_is_distillable():
- nlp = English()
- lemmatizer = nlp.add_pipe("trainable_lemmatizer")
- assert lemmatizer.is_distillable
-
-
-def test_distill():
- teacher = English()
- teacher_lemmatizer = teacher.add_pipe("trainable_lemmatizer")
- teacher_lemmatizer.min_tree_freq = 1
- train_examples = []
- for t in TRAIN_DATA:
- train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
-
- optimizer = teacher.initialize(get_examples=lambda: train_examples)
-
- for i in range(50):
- losses = {}
- teacher.update(train_examples, sgd=optimizer, losses=losses)
- assert losses["trainable_lemmatizer"] < 0.00001
-
- student = English()
- student_lemmatizer = student.add_pipe("trainable_lemmatizer")
- student_lemmatizer.min_tree_freq = 1
- student_lemmatizer.initialize(
- get_examples=lambda: train_examples, labels=teacher_lemmatizer.label_data
- )
-
- distill_examples = [
- Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
- ]
-
- for i in range(50):
- losses = {}
- student_lemmatizer.distill(
- teacher_lemmatizer, distill_examples, sgd=optimizer, losses=losses
- )
- assert losses["trainable_lemmatizer"] < 0.00001
-
- test_text = "She likes blue eggs"
- doc = student(test_text)
- assert doc[0].lemma_ == "she"
- assert doc[1].lemma_ == "like"
- assert doc[2].lemma_ == "blue"
- assert doc[3].lemma_ == "egg"
-
-
def test_lemmatizer_requires_labels():
nlp = English()
nlp.add_pipe("trainable_lemmatizer")
@@ -403,26 +357,3 @@ def test_save_activations():
]
assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
-
-
-def test_save_activations():
- nlp = English()
- lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer"))
- lemmatizer.min_tree_freq = 1
- train_examples = []
- for t in TRAIN_DATA:
- train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
- nlp.initialize(get_examples=lambda: train_examples)
- nO = lemmatizer.model.get_dim("nO")
-
- doc = nlp("This is a test.")
- assert "trainable_lemmatizer" not in doc.activations
-
- lemmatizer.save_activations = True
- doc = nlp("This is a test.")
- assert list(doc.activations["trainable_lemmatizer"].keys()) == [
- "probabilities",
- "tree_ids",
- ]
- assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
- assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index a3ab80f7ee0..32e7a265f37 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,18 +1,17 @@
-from typing import Any, Callable, Dict, Iterable, cast
+from typing import Callable, Iterable, Dict, Any, cast
import pytest
from numpy.testing import assert_equal
from thinc.types import Ragged
-from thinc.types import Ragged
from spacy import Language, registry, util
from spacy.attrs import ENT_KB_ID
from spacy.compat import pickle
-from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
+from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase, get_candidates
from spacy.lang.en import English
from spacy.ml import load_kb
-from spacy.ml.models.entity_linker import build_span_maker, get_candidates
from spacy.pipeline import EntityLinker, TrainablePipe
+from spacy.pipeline.legacy import EntityLinker_v1
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer
from spacy.tests.util import make_tempdir
@@ -454,17 +453,16 @@ def test_candidate_generation(nlp):
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
# test the size of the relevant candidates
- adam_ent_cands = get_candidates(mykb, adam_ent)
assert len(get_candidates(mykb, douglas_ent)) == 2
- assert len(adam_ent_cands) == 1
+ assert len(get_candidates(mykb, adam_ent)) == 1
assert len(get_candidates(mykb, Adam_ent)) == 0 # default case sensitive
assert len(get_candidates(mykb, shrubbery_ent)) == 0
# test the content of the candidates
- assert adam_ent_cands[0].entity_id_ == "Q2"
- assert adam_ent_cands[0].alias == "adam"
- assert_almost_equal(adam_ent_cands[0].entity_freq, 12)
- assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9)
+ assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
+ assert get_candidates(mykb, adam_ent)[0].alias_ == "adam"
+ assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
+ assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)
def test_el_pipe_configuration(nlp):
@@ -492,7 +490,7 @@ def create_kb(vocab):
assert doc[2].ent_kb_id_ == "Q2"
def get_lowercased_candidates(kb, span):
- return kb._get_alias_candidates(span.text.lower())
+ return kb.get_alias_candidates(span.text.lower())
def get_lowercased_candidates_batch(kb, spans):
return [get_lowercased_candidates(kb, span) for span in spans]
@@ -551,22 +549,24 @@ def test_vocab_serialization(nlp):
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
- candidates = mykb._get_alias_candidates("adam")
+ candidates = mykb.get_alias_candidates("adam")
assert len(candidates) == 1
- assert candidates[0].entity_id == q2_hash
- assert candidates[0].entity_id_ == "Q2"
- assert candidates[0].alias == "adam"
+ assert candidates[0].entity == q2_hash
+ assert candidates[0].entity_ == "Q2"
+ assert candidates[0].alias == adam_hash
+ assert candidates[0].alias_ == "adam"
with make_tempdir() as d:
mykb.to_disk(d / "kb")
kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
kb_new_vocab.from_disk(d / "kb")
- candidates = kb_new_vocab._get_alias_candidates("adam")
+ candidates = kb_new_vocab.get_alias_candidates("adam")
assert len(candidates) == 1
- assert candidates[0].entity_id == q2_hash
- assert candidates[0].entity_id_ == "Q2"
- assert candidates[0].alias == "adam"
+ assert candidates[0].entity == q2_hash
+ assert candidates[0].entity_ == "Q2"
+ assert candidates[0].alias == adam_hash
+ assert candidates[0].alias_ == "adam"
assert kb_new_vocab.get_vector("Q2") == [2]
assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
@@ -586,20 +586,20 @@ def test_append_alias(nlp):
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
# test the size of the relevant candidates
- assert len(mykb._get_alias_candidates("douglas")) == 2
+ assert len(mykb.get_alias_candidates("douglas")) == 2
# append an alias
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
# test the size of the relevant candidates has been incremented
- assert len(mykb._get_alias_candidates("douglas")) == 3
+ assert len(mykb.get_alias_candidates("douglas")) == 3
# append the same alias-entity pair again should not work (will throw a warning)
with pytest.warns(UserWarning):
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
# test the size of the relevant candidates remained unchanged
- assert len(mykb._get_alias_candidates("douglas")) == 3
+ assert len(mykb.get_alias_candidates("douglas")) == 3
@pytest.mark.filterwarnings("ignore:\\[W036")
@@ -999,11 +999,11 @@ def test_kb_to_bytes():
assert kb_2.contains_alias("Russ Cochran")
assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
- assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
- kb_2._get_alias_candidates("Russ Cochran")
+ assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
+ kb_2.get_alias_candidates("Russ Cochran")
)
- assert len(kb_1._get_alias_candidates("Randomness")) == len(
- kb_2._get_alias_candidates("Randomness")
+ assert len(kb_1.get_alias_candidates("Randomness")) == len(
+ kb_2.get_alias_candidates("Randomness")
)
@@ -1084,6 +1084,7 @@ def test_scorer_links():
@pytest.mark.parametrize(
"name,config",
[
+ ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
],
)
@@ -1110,7 +1111,10 @@ def create_kb(vocab):
return mykb
entity_linker = nlp.add_pipe(name, config={"model": config})
- assert isinstance(entity_linker, EntityLinker)
+ if config["@architectures"] == "spacy.EntityLinker.v1":
+ assert isinstance(entity_linker, EntityLinker_v1)
+ else:
+ assert isinstance(entity_linker, EntityLinker)
entity_linker.set_kb(create_kb)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
@@ -1293,7 +1297,6 @@ def create_kb(vocab):
assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
-def test_save_activations():
def test_save_activations():
nlp = English()
vector_length = 3
@@ -1309,7 +1312,7 @@ def create_kb(vocab):
# create artificial KB - assign same prior weight to the two russ cochran's
# Q2146908 (Russ Cochran): American golfer
# Q7381115 (Russ Cochran): publisher
- mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
+ mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 9a6bbc9fc60..c2b65977ac3 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -1,5 +1,4 @@
from typing import cast
-
import pytest
from numpy.testing import assert_almost_equal, assert_equal
from thinc.api import get_current_ops
@@ -10,7 +9,7 @@
from spacy.language import Language
from spacy.morphology import Morphology
from spacy.pipeline import TrainablePipe
-from spacy.tests.util import make_tempdir
+from spacy.attrs import MORPH
from spacy.tokens import Doc
from spacy.training import Example
@@ -78,12 +77,6 @@ def test_implicit_label():
nlp.initialize(get_examples=lambda: train_examples)
-def test_is_distillable():
- nlp = English()
- morphologizer = nlp.add_pipe("morphologizer")
- assert morphologizer.is_distillable
-
-
def test_no_resize():
nlp = Language()
morphologizer = nlp.add_pipe("morphologizer")
@@ -255,25 +248,3 @@ def test_save_activations():
}
assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
-
-
-def test_save_activations():
- nlp = English()
- morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
- train_examples = []
- for inst in TRAIN_DATA:
- train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
- nlp.initialize(get_examples=lambda: train_examples)
-
- doc = nlp("This is a test.")
- assert "morphologizer" not in doc.activations
-
- morphologizer.save_activations = True
- doc = nlp("This is a test.")
- assert "morphologizer" in doc.activations
- assert set(doc.activations["morphologizer"].keys()) == {
- "label_ids",
- "probabilities",
- }
- assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
- assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 9a798eae890..2e40d86ff48 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -1,5 +1,4 @@
from typing import cast
-
import pytest
from numpy.testing import assert_equal
@@ -8,17 +7,10 @@
from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline import TrainablePipe
-from spacy.pipeline import TrainablePipe
from spacy.tests.util import make_tempdir
from spacy.training import Example
-def test_is_distillable():
- nlp = English()
- senter = nlp.add_pipe("senter")
- assert senter.is_distillable
-
-
def test_label_types():
nlp = Language()
senter = nlp.add_pipe("senter")
@@ -134,26 +126,3 @@ def test_save_activations():
assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
assert doc.activations["senter"]["probabilities"].shape == (5, nO)
assert doc.activations["senter"]["label_ids"].shape == (5,)
-
-
-def test_save_activations():
- # Test if activations are correctly added to Doc when requested.
- nlp = English()
- senter = cast(TrainablePipe, nlp.add_pipe("senter"))
-
- train_examples = []
- for t in TRAIN_DATA:
- train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-
- nlp.initialize(get_examples=lambda: train_examples)
- nO = senter.model.get_dim("nO")
-
- doc = nlp("This is a test.")
- assert "senter" not in doc.activations
-
- senter.save_activations = True
- doc = nlp("This is a test.")
- assert "senter" in doc.activations
- assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
- assert doc.activations["senter"]["probabilities"].shape == (5, nO)
- assert doc.activations["senter"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 05e814f0733..5deb323dd71 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,5 +1,4 @@
from typing import cast
-
import pytest
from numpy.testing import assert_almost_equal, assert_equal
from thinc.api import compounding, get_current_ops
@@ -9,7 +8,7 @@
from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline import TrainablePipe
-from spacy.training import Example
+from thinc.api import compounding
from ..util import make_tempdir
@@ -25,9 +24,7 @@ def test_issue4348():
optimizer = nlp.initialize()
for i in range(5):
losses = {}
- batches = util.minibatch(
- TRAIN_DATA, size=compounding(4.0, 32.0, 1.001).to_generator()
- )
+ batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)
@@ -240,52 +237,6 @@ def test_overfitting_IO():
assert doc3[0].tag_ != "N"
-def test_is_distillable():
- nlp = English()
- tagger = nlp.add_pipe("tagger")
- assert tagger.is_distillable
-
-
-def test_distill():
- teacher = English()
- teacher_tagger = teacher.add_pipe("tagger")
- train_examples = []
- for t in TRAIN_DATA:
- train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
-
- optimizer = teacher.initialize(get_examples=lambda: train_examples)
-
- for i in range(50):
- losses = {}
- teacher.update(train_examples, sgd=optimizer, losses=losses)
- assert losses["tagger"] < 0.00001
-
- student = English()
- student_tagger = student.add_pipe("tagger")
- student_tagger.min_tree_freq = 1
- student_tagger.initialize(
- get_examples=lambda: train_examples, labels=teacher_tagger.label_data
- )
-
- distill_examples = [
- Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
- ]
-
- for i in range(50):
- losses = {}
- student_tagger.distill(
- teacher_tagger, distill_examples, sgd=optimizer, losses=losses
- )
- assert losses["tagger"] < 0.00001
-
- test_text = "I like blue eggs"
- doc = student(test_text)
- assert doc[0].tag_ == "N"
- assert doc[1].tag_ == "V"
- assert doc[2].tag_ == "J"
- assert doc[3].tag_ == "N"
-
-
def test_save_activations():
# Test if activations are correctly added to Doc when requested.
nlp = English()
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index f834597fafe..710dac0571d 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,5 +1,5 @@
-import random
from typing import cast
+import random
import numpy.random
import pytest
@@ -13,16 +13,12 @@
from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline import TextCategorizer, TrainablePipe
-from spacy.pipeline.textcat import (
- single_label_bow_config,
- single_label_cnn_config,
- single_label_default_config,
-)
-from spacy.pipeline.textcat_multilabel import (
- multi_label_bow_config,
- multi_label_cnn_config,
- multi_label_default_config,
-)
+from spacy.pipeline.textcat import single_label_bow_config
+from spacy.pipeline.textcat import single_label_cnn_config
+from spacy.pipeline.textcat import single_label_default_config
+from spacy.pipeline.textcat_multilabel import multi_label_bow_config
+from spacy.pipeline.textcat_multilabel import multi_label_cnn_config
+from spacy.pipeline.textcat_multilabel import multi_label_default_config
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer
from spacy.tokens import Doc, DocBin
@@ -104,9 +100,7 @@ def test_issue3611():
optimizer = nlp.initialize()
for i in range(3):
losses = {}
- batches = util.minibatch(
- train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
- )
+ batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
@@ -143,9 +137,7 @@ def test_issue4030():
optimizer = nlp.initialize()
for i in range(3):
losses = {}
- batches = util.minibatch(
- train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
- )
+ batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
@@ -304,7 +296,6 @@ def test_issue9904():
examples = get_examples()
scores = textcat.predict([eg.predicted for eg in examples])["probabilities"]
- scores = textcat.predict([eg.predicted for eg in examples])["probabilities"]
loss = textcat.get_loss(examples, scores)[0]
loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0]
@@ -474,8 +465,6 @@ def test_no_resize(name, textcat_config):
# CNN
("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
- ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
- ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
],
)
# fmt: on
@@ -613,12 +602,6 @@ def test_initialize_examples(name, get_examples, train_data):
nlp.initialize(get_examples=get_examples())
-def test_is_distillable():
- nlp = English()
- textcat = nlp.add_pipe("textcat")
- assert not textcat.is_distillable
-
-
def test_overfitting_IO():
# Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
fix_random_seed(0)
@@ -963,11 +946,9 @@ def test_textcat_multi_threshold():
assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
-def test_save_activations():
def test_save_activations():
nlp = English()
textcat = cast(TrainablePipe, nlp.add_pipe("textcat"))
- textcat = cast(TrainablePipe, nlp.add_pipe("textcat"))
train_examples = []
for text, annotations in TRAIN_DATA_SINGLE_LABEL:
@@ -984,34 +965,6 @@ def test_save_activations():
assert doc.activations["textcat"]["probabilities"].shape == (nO,)
-def test_save_activations_multi():
- nlp = English()
- textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel"))
-
- train_examples = []
- for text, annotations in TRAIN_DATA_MULTI_LABEL:
- train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
- nlp.initialize(get_examples=lambda: train_examples)
- nO = textcat.model.get_dim("nO")
-
- doc = nlp("This is a test.")
- assert "textcat_multilabel" not in doc.activations
-
- textcat.save_activations = True
- doc = nlp("This is a test.")
- assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"]
- assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,)
- nO = textcat.model.get_dim("nO")
-
- doc = nlp("This is a test.")
- assert "textcat" not in doc.activations
-
- textcat.save_activations = True
- doc = nlp("This is a test.")
- assert list(doc.activations["textcat"].keys()) == ["probabilities"]
- assert doc.activations["textcat"]["probabilities"].shape == (nO,)
-
-
def test_save_activations_multi():
nlp = English()
textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel"))
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 9fb6a72c87f..fc0404f1423 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -50,6 +50,8 @@ cdef class Doc:
cdef public dict activations
+ cdef public dict activations
+
cdef public dict user_hooks
cdef public dict user_token_hooks
cdef public dict user_span_hooks
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index dc7c0143029..5fda6f2f789 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -8,6 +8,7 @@ from typing import (
List,
Optional,
Protocol,
+ Sequence,
Tuple,
Union,
overload,
@@ -16,15 +17,20 @@ from typing import (
import numpy as np
from cymem.cymem import Pool
from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
-
+from .span import Span
+from .token import Token
+from .span_groups import SpanGroups
+from .retokenizer import Retokenizer
from ..lexeme import Lexeme
from ..vocab import Vocab
-from .retokenizer import Retokenizer
+from ._dict_proxies import SpanGroups
+from ._retokenize import Retokenizer
from .span import Span
-from .span_groups import SpanGroups
from .token import Token
from .underscore import Underscore
+DOCBIN_ALL_ATTRS: Tuple[str, ...]
+
class DocMethod(Protocol):
def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
@@ -34,6 +40,7 @@ class Doc:
spans: SpanGroups
max_length: int
length: int
+ sentiment: float
activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
cats: Dict[str, float]
user_hooks: Dict[str, Callable[..., Any]]
@@ -118,7 +125,6 @@ class Doc:
start_idx: int,
end_idx: int,
label: Union[int, str] = ...,
- *,
kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ...,
alignment_mode: str = ...,
@@ -146,12 +152,12 @@ class Doc:
blocked: Optional[List[Span]] = ...,
missing: Optional[List[Span]] = ...,
outside: Optional[List[Span]] = ...,
- default: str = ...,
+ default: str = ...
) -> None: ...
@property
- def noun_chunks(self) -> Tuple[Span]: ...
+ def noun_chunks(self) -> Iterator[Span]: ...
@property
- def sents(self) -> Tuple[Span]: ...
+ def sents(self) -> Iterator[Span]: ...
@property
def lang(self) -> int: ...
@property
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index e92c0e833e0..310ce0dc88d 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -214,7 +214,6 @@ alignment mode `"strict".
| `start` | The index of the first character of the span. ~~int~~ |
| `end` | The index of the last character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
-| _keyword-only_ | |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
@@ -654,10 +653,11 @@ the [`TextCategorizer`](/api/textcategorizer).
## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
-Returns a tuple of the base noun phrases in the doc, if the document has been
-syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
-does not permit other NPs to be nested within it – so no NP-level coordination,
-no prepositional phrases, and no relative clauses.
+Iterate over the base noun phrases in the document. Yields base noun-phrase
+`Span` objects, if the document has been syntactically parsed. A base noun
+phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
+nested within it – so no NP-level coordination, no prepositional phrases, and no
+relative clauses.
To customize the noun chunk iterator in a loaded pipeline, modify
[`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
@@ -674,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised.
> assert chunks[1].text == "another phrase"
> ```
-| Name | Description |
-| ----------- | -------------------------------------------- |
-| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |
+| Name | Description |
+| ---------- | ------------------------------------- |
+| **YIELDS** | Noun chunks in the document. ~~Span~~ |
## Doc.sents {id="sents",tag="property",model="sentences"}
-Returns a tuple of the sentences in the document. Sentence spans have no label.
+Iterate over the sentences in the document. Sentence spans have no label.
This property is only available when
[sentence boundaries](/usage/linguistic-features#sbd) have been set on the
@@ -696,9 +696,9 @@ will raise an error otherwise.
> assert [s.root.text for s in sents] == ["is", "'s"]
> ```
-| Name | Description |
-| ----------- | ------------------------------------------ |
-| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |
+| Name | Description |
+| ---------- | ----------------------------------- |
+| **YIELDS** | Sentences in the document. ~~Span~~ |
## Doc.has_vector {id="has_vector",tag="property",model="vectors"}
@@ -762,6 +762,7 @@ The L2 norm of the document's vector representation.
| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
| `lang` 2.1 | Language of the document's vocabulary. ~~int~~ |
| `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ |
+| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
@@ -785,6 +786,7 @@ serialization by passing in the string names via the `exclude` argument.
| Name | Description |
| ------------------ | --------------------------------------------- |
| `text` | The value of the `Doc.text` attribute. |
+| `sentiment` | The value of the `Doc.sentiment` attribute. |
| `tensor` | The value of the `Doc.tensor` attribute. |
| `user_data` | The value of the `Doc.user_data` dictionary. |
| `user_data_keys` | The keys of the `Doc.user_data` dictionary. |
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index f4b83d88bbf..fe720a60dd1 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -53,21 +53,20 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("entity_linker", config=config)
> ```
-| Setting | Description |
-| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
-| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
-| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
-| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
-| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
-| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
-| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
-| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
-| `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
-| `generate_empty_kb` 3.5.1 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
-| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
-| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
-| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
+| Setting | Description |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
+| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
+| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
+| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
+| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
+| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
+| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
+| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
+| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index 9514bc773b9..1fda807cb32 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.
| Setting | Description |
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ |
+| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ |
| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |
@@ -121,39 +121,6 @@ delegate to the [`predict`](/api/morphologizer#predict) and
| `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~ |
-## Morphologizer.distill {id="distill", tag="method,experimental", version="4"}
-
-Train a pipe (the student) on the predictions of another pipe (the teacher). The
-student is typically trained on the probability distribution of the teacher, but
-details may differ per pipe. The goal of distillation is to transfer knowledge
-from the teacher to the student.
-
-The distillation is performed on ~~Example~~ objects. The `Example.reference`
-and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
-same orthography. Even though the reference does not need have to have gold
-annotations, the teacher could adds its own annotations when necessary.
-
-This feature is experimental.
-
-> #### Example
->
-> ```python
-> teacher_pipe = teacher.add_pipe("morphologizer")
-> student_pipe = student.add_pipe("morphologizer")
-> optimizer = nlp.resume_training()
-> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
-> ```
-
-| Name | Description |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ |
-| `examples` | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ |
-| _keyword-only_ | |
-| `drop` | Dropout rate. ~~float~~ |
-| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
-| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
-| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
-
## Morphologizer.pipe {id="pipe",tag="method"}
Apply the pipe to a stream of documents. This usually happens under the hood
@@ -292,27 +259,6 @@ predicted scores.
| `scores` | Scores representing the model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
-## Morphologizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
-
-Calculate the loss and its gradient for the batch of student scores relative to
-the teacher scores.
-
-> #### Example
->
-> ```python
-> teacher_morphologizer = teacher.get_pipe("morphologizer")
-> student_morphologizer = student.add_pipe("morphologizer")
-> student_scores = student_morphologizer.predict([eg.predicted for eg in examples])
-> teacher_scores = teacher_morphologizer.predict([eg.predicted for eg in examples])
-> loss, d_loss = student_morphologizer.get_teacher_student_loss(teacher_scores, student_scores)
-> ```
-
-| Name | Description |
-| ---------------- | --------------------------------------------------------------------------- |
-| `teacher_scores` | Scores representing the teacher model's predictions. |
-| `student_scores` | Scores representing the student model's predictions. |
-| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
-
## Morphologizer.create_optimizer {id="create_optimizer",tag="method"}
Create an optimizer for the pipeline component.