diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index 54c880a7d89..2ef639cad52 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -1,10 +1,10 @@ from collections import Counter from itertools import islice -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast import numpy as np import srsly -from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy +from thinc.api import Config, Model, SequenceCategoricalCrossentropy from thinc.types import ArrayXd, Floats2d, Ints1d from .. import util @@ -18,6 +18,10 @@ from .lemmatizer import lemmatizer_score from .trainable_pipe import TrainablePipe +# The cutoff value of *top_k* above which an alternative method is used to process guesses. +TOP_K_GUARDRAIL = 20 + + ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]] @@ -50,7 +54,6 @@ "top_k": 1, "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, "save_activations": False, - "save_activations": False, }, default_score_weights={"lemma_acc": 1.0}, ) @@ -64,7 +67,6 @@ def make_edit_tree_lemmatizer( top_k: int, scorer: Optional[Callable], save_activations: bool, - save_activations: bool, ): """Construct an EditTreeLemmatizer component.""" return EditTreeLemmatizer( @@ -77,7 +79,6 @@ def make_edit_tree_lemmatizer( top_k=top_k, scorer=scorer, save_activations=save_activations, - save_activations=save_activations, ) @@ -98,7 +99,6 @@ def __init__( top_k: int = 1, scorer: Optional[Callable] = lemmatizer_score, save_activations: bool = False, - save_activations: bool = False, ): """ Construct an edit tree lemmatizer. @@ -111,7 +111,6 @@ def __init__( overwrite (bool): overwrite existing lemma annotations. top_k (int): try to apply at most the k most probable edit trees. save_activations (bool): save model activations in Doc when annotating. 
- save_activations (bool): save model activations in Doc when annotating. """ self.vocab = vocab self.model = model @@ -127,7 +126,6 @@ def __init__( self.cfg: Dict[str, Any] = {"labels": []} self.scorer = scorer self.save_activations = save_activations - self.save_activations = save_activations def get_loss( self, examples: Iterable[Example], scores: List[Floats2d] @@ -156,25 +154,6 @@ def get_loss( return float(loss), d_scores - def get_teacher_student_loss( - self, teacher_scores: List[Floats2d], student_scores: List[Floats2d] - ) -> Tuple[float, List[Floats2d]]: - """Calculate the loss and its gradient for a batch of student - scores, relative to teacher scores. - - teacher_scores: Scores representing the teacher model's predictions. - student_scores: Scores representing the student model's predictions. - - RETURNS (Tuple[float, float]): The loss and the gradient. - - DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss - """ - loss_func = SequenceCategoricalCrossentropy(normalize=False) - d_scores, loss = loss_func(student_scores, teacher_scores) - if self.model.ops.xp.isnan(loss): - raise ValueError(Errors.E910.format(name=self.name)) - return float(loss), d_scores - def predict(self, docs: Iterable[Doc]) -> ActivationsT: n_docs = len(list(docs)) if not any(len(doc) for doc in docs): @@ -186,21 +165,13 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT: scores: List[Floats2d] = [ self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs ] - guesses: List[Ints1d] = [ - self.model.ops.alloc((0,), dtype="i") for doc in docs - ] - scores: List[Floats2d] = [ - self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs - ] assert len(guesses) == n_docs return {"probabilities": scores, "tree_ids": guesses} - return {"probabilities": scores, "tree_ids": guesses} scores = self.model.predict(docs) assert len(scores) == n_docs guesses = scores2guesses(docs, scores) assert len(guesses) == n_docs return {"probabilities": scores, 
"tree_ids": guesses} - return {"probabilities": scores, "tree_ids": guesses} def _scores2guesses_top_k_equals_1(self, docs, scores): guesses = [] @@ -260,15 +231,9 @@ def _scores2guesses_top_k_guardrail(self, docs, scores): return guesses - def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): - batch_tree_ids = activations["tree_ids"] def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): batch_tree_ids = activations["tree_ids"] for i, doc in enumerate(docs): - if self.save_activations: - doc.activations[self.name] = {} - for act_name, acts in activations.items(): - doc.activations[self.name][act_name] = acts[i] if self.save_activations: doc.activations[self.name] = {} for act_name, acts in activations.items(): diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index eb87d1db987..0f15ef38d45 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -6,37 +6,24 @@ from itertools import islice import srsly import random -import warnings from itertools import islice from pathlib import Path -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Union, cast +from typing import Any, Callable, Dict, Iterable, List, Optional, Union import srsly -from numpy import dtype from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate -from thinc.types import Floats1d, Floats2d, Ints1d, Ragged +from thinc.types import Floats2d -from ..kb import KnowledgeBase, Candidate -from ..ml import empty_kb -from ..tokens import Doc, Span -from ..ml import empty_kb -from ..tokens import Doc, Span, SpanGroup -from .pipe import deserialize_config -from .trainable_pipe import TrainablePipe -from ..language import Language -from ..vocab import Vocab -from ..training import Example, validate_examples, validate_get_examples -from ..errors import Errors -from ..util import SimpleFrozenList, registry from .. 
import util from ..errors import Errors from ..kb import Candidate, KnowledgeBase from ..language import Language from ..scorer import Scorer -from ..tokens import Doc, Span, SpanGroup +from ..tokens import Doc, Span from ..training import Example, validate_examples, validate_get_examples from ..util import SimpleFrozenList, registry from ..vocab import Vocab +from .legacy.entity_linker import EntityLinker_v1 from .pipe import deserialize_config from .trainable_pipe import TrainablePipe @@ -45,6 +32,9 @@ KNOWLEDGE_BASE_IDS = "kb_ids" +# See #9050 +BACKWARD_OVERWRITE = True + default_model_config = """ [model] @architectures = "spacy.EntityLinker.v2" @@ -75,13 +65,13 @@ "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, - "overwrite": False, + "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, + "overwrite": True, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "use_gold_ents": True, "candidates_batch_size": 1, "threshold": None, "save_activations": False, - "save_activations": False, }, default_score_weights={ "nel_micro_f": 1.0, @@ -101,7 +91,7 @@ def make_entity_linker( entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_batch: Callable[ - [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]] + [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool, @@ -110,7 +100,6 @@ def make_entity_linker( candidates_batch_size: int, threshold: Optional[float] = None, save_activations: bool, - save_activations: bool, ): """Construct an EntityLinker component. @@ -125,7 +114,7 @@ def make_entity_linker( get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. 
get_candidates_batch ( - Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]] + Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. @@ -135,11 +124,23 @@ def make_entity_linker( threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold, prediction is discarded. If None, predictions are not filtered by any threshold. save_activations (bool): save model activations in Doc when annotating. - save_activations (bool): save model activations in Doc when annotating. """ - if not model.attrs.get("include_span_maker", False): - raise ValueError(Errors.E4005) + if not model.attrs.get("include_span_maker", False): + # The only difference in arguments here is that use_gold_ents and threshold aren't available. 
+ return EntityLinker_v1( + nlp.vocab, + model, + name, + labels_discard=labels_discard, + n_sents=n_sents, + incl_prior=incl_prior, + incl_context=incl_context, + entity_vector_length=entity_vector_length, + get_candidates=get_candidates, + overwrite=overwrite, + scorer=scorer, + ) return EntityLinker( nlp.vocab, model, @@ -158,7 +159,6 @@ def make_entity_linker( candidates_batch_size=candidates_batch_size, threshold=threshold, save_activations=save_activations, - save_activations=save_activations, ) @@ -192,15 +192,15 @@ def __init__( entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_batch: Callable[ - [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]] + [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], - overwrite: bool = False, + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], + overwrite: bool = BACKWARD_OVERWRITE, scorer: Optional[Callable] = entity_linker_score, use_gold_ents: bool, candidates_batch_size: int, threshold: Optional[float] = None, save_activations: bool = False, - save_activations: bool = False, ) -> None: """Initialize an entity linker. @@ -216,10 +216,10 @@ def __init__( get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. get_candidates_batch ( - Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], + Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. - overwrite (bool): Whether to overwrite existing non-empty annotations. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. 
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. @@ -255,12 +255,9 @@ def __init__( self.candidates_batch_size = candidates_batch_size self.threshold = threshold self.save_activations = save_activations - self.save_activations = save_activations if candidates_batch_size < 1: raise ValueError(Errors.E1044) - if self.incl_prior and not self.kb.supports_prior_probs: - warnings.warn(Warnings.W401) def _score_with_ents_set(examples: Iterable[Example], **kwargs): # Because of how spaCy works, we can't just score immediately, because Language.evaluate @@ -463,7 +460,6 @@ def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): loss = loss / len(entity_encodings) return float(loss), out - def predict(self, docs: Iterable[Doc]) -> ActivationsT: def predict(self, docs: Iterable[Doc]) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. Returns the KB IDs for each entity in each doc, including NIL if there is @@ -481,47 +477,39 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT: xp = ops.xp docs_ents: List[Ragged] = [] docs_scores: List[Ragged] = [] - ops = self.model.ops - xp = ops.xp - docs_ents: List[Ragged] = [] - docs_scores: List[Ragged] = [] if not docs: - return { - KNOWLEDGE_BASE_IDS: final_kb_ids, - "ents": docs_ents, - "scores": docs_scores, - } + return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores} if isinstance(docs, Doc): docs = [docs] - for doc in docs: - doc_ents: List[Ints1d] = [] - doc_scores: List[Floats1d] = [] for doc in docs: doc_ents: List[Ints1d] = [] doc_scores: List[Floats1d] = [] if len(doc) == 0: - docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0))) - docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0))) docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0))) docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0))) continue sentences 
= [s for s in doc.sents] - # Loop over entities in batches. - for ent_idx in range(0, len(doc.ents), self.candidates_batch_size): - ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size] - - # Look up candidate entities. - valid_ent_idx = [ - idx - for idx in range(len(ent_batch)) - if ent_batch[idx].label_ not in self.labels_discard - ] - - batch_candidates = list( - self.get_candidates_batch( - self.kb, - SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]), + if self.incl_context: + # get n_neighbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + sent_doc = doc[start_token:end_token].as_doc() + # currently, the context is the same for each entity in a sentence (should be refined) + sentence_encoding = self.model.predict([sent_doc])[0] + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + entity_count += 1 + if ent.label_ in self.labels_discard: + # ignoring this entity - setting to NIL + final_kb_ids.append(self.NIL) + self._add_activations( + doc_scores=doc_scores, + doc_ents=doc_ents, + scores=[0.0], + ents=[0], ) else: candidates = list(self.get_candidates(self.kb, ent)) @@ -592,39 +580,23 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT: method="predict", msg="result variables not of equal length" ) raise RuntimeError(err) - return { - KNOWLEDGE_BASE_IDS: final_kb_ids, - "ents": docs_ents, - "scores": docs_scores, - } + return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores} - def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: """Modify a batch of documents, using pre-computed scores. 
docs (Iterable[Doc]): The documents to modify. activations (ActivationsT): The activations used for setting annotations, produced by EntityLinker.predict. - activations (ActivationsT): The activations used for setting annotations, produced - by EntityLinker.predict. DOCS: https://spacy.io/api/entitylinker#set_annotations """ kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS]) - kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS]) count_ents = len([ent for doc in docs for ent in doc.ents]) if count_ents != len(kb_ids): raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) i = 0 overwrite = self.cfg["overwrite"] - for j, doc in enumerate(docs): - if self.save_activations: - doc.activations[self.name] = {} - for act_name, acts in activations.items(): - if act_name != KNOWLEDGE_BASE_IDS: - # We only copy activations that are Ragged. - doc.activations[self.name][act_name] = cast(Ragged, acts[j]) - for j, doc in enumerate(docs): if self.save_activations: doc.activations[self.name] = {} @@ -760,32 +732,3 @@ def _add_activations( ops = self.model.ops doc_scores.append(ops.asarray1f(scores)) doc_ents.append(ops.asarray1i(ents, dtype="uint64")) - - def _add_doc_activations( - self, - *, - docs_scores: List[Ragged], - docs_ents: List[Ragged], - doc_scores: List[Floats1d], - doc_ents: List[Ints1d], - ): - if not self.save_activations: - return - ops = self.model.ops - lengths = ops.asarray1i([s.shape[0] for s in doc_scores]) - docs_scores.append(Ragged(ops.flatten(doc_scores), lengths)) - docs_ents.append(Ragged(ops.flatten(doc_ents), lengths)) - - def _add_activations( - self, - *, - doc_scores: List[Floats1d], - doc_ents: List[Ints1d], - scores: Sequence[float], - ents: Sequence[int], - ): - if not self.save_activations: - return - ops = self.model.ops - doc_scores.append(ops.asarray1f(scores)) - doc_ents.append(ops.asarray1i(ents, dtype="uint64")) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 
443b6818dc2..cc8f87936b9 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,11 +1,10 @@ # cython: infer_types=True, profile=True, binding=True from typing import Callable, Dict, Iterable, List, Optional, Union import srsly -from thinc.api import Model, Config -from thinc.legacy import LegacySequenceCategoricalCrossentropy +from thinc.api import SequenceCategoricalCrossentropy, Model, Config from thinc.types import Floats2d, Ints1d from itertools import islice -from typing import Callable, Dict, Iterable, Optional, Union +from typing import Callable, Dict, Optional, Union from thinc.api import Config, Model, SequenceCategoricalCrossentropy @@ -24,10 +23,13 @@ from ..errors import Errors from ..language import Language from ..parts_of_speech import IDS as POS_IDS from ..scorer import Scorer -from ..symbols import POS from ..training import validate_examples, validate_get_examples from ..util import registry -from .tagger import ActivationsT, Tagger +from .tagger import Tagger + +# See #9050 +BACKWARD_OVERWRITE = True +BACKWARD_EXTEND = False default_model_config = """ [model] @@ -65,13 +67,6 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "save_activations": False, }, - default_config={ - "model": DEFAULT_MORPH_MODEL, - "overwrite": True, - "extend": False, - "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, - "save_activations": False, - }, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( @@ -83,12 +78,9 @@ def make_morphologizer( label_smoothing: float, scorer: Optional[Callable], save_activations: bool, - save_activations: bool, ): return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer, save_activations=save_activations) - return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer, - 
save_activations=save_activations) def morphologizer_score(examples, **kwargs): @@ -120,11 +112,11 @@ class Morphologizer(Tagger): model: Model, name: str = "morphologizer", *, - overwrite: bool = False, - extend: bool = False, + overwrite: bool = BACKWARD_OVERWRITE, + extend: bool = BACKWARD_EXTEND, + label_smoothing: float = 0.0, scorer: Optional[Callable] = morphologizer_score, save_activations: bool = False, - save_activations: bool = False, ): """Initialize a morphologizer. @@ -132,13 +124,10 @@ class Morphologizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. - overwrite (bool): Whether to overwrite existing annotations. - extend (bool): Whether to extend existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_token_attr for the attributes "pos" and "morph" and Scorer.score_token_attr_per_feat for the attribute "morph". save_activations (bool): save model activations in Doc when annotating. - save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/morphologizer#init """ @@ -160,7 +149,6 @@ class Morphologizer(Tagger): self.cfg = dict(sorted(cfg.items())) self.scorer = scorer self.save_activations = save_activations - self.save_activations = save_activations @property def labels(self): @@ -254,18 +242,15 @@ class Morphologizer(Tagger): assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict. 
- activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict. DOCS: https://spacy.io/api/morphologizer#set_annotations """ batch_tag_ids = activations["label_ids"] - batch_tag_ids = activations["label_ids"] if isinstance(docs, Doc): docs = [docs] cdef Doc doc @@ -276,10 +261,6 @@ class Morphologizer(Tagger): # to allocate a compatible container out of the iterable. labels = tuple(self.labels) for i, doc in enumerate(docs): - if self.save_activations: - doc.activations[self.name] = {} - for act_name, acts in activations.items(): - doc.activations[self.name][act_name] = acts[i] if self.save_activations: doc.activations[self.name] = {} for act_name, acts in activations.items(): @@ -321,8 +302,7 @@ class Morphologizer(Tagger): DOCS: https://spacy.io/api/morphologizer#get_loss """ validate_examples(examples, "Morphologizer.get_loss") - loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, - label_smoothing=self.cfg["label_smoothing"]) + loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False) truths = [] for eg in examples: eg_truths = [] diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 35627bbf2ad..521afe1d181 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -1,18 +1,25 @@ # cython: infer_types=True, profile=True, binding=True +from typing import Dict, Iterable, Optional, Callable, List, Union from itertools import islice -from typing import Callable, Dict, Iterable, List, Optional, Union +from typing import Callable, Optional -from thinc.api import Config, Model, SequenceCategoricalCrossentropy +import srsly +from thinc.api import Model, SequenceCategoricalCrossentropy, Config +from thinc.types import Floats2d, Ints1d from ..tokens.doc cimport Doc -from .. 
import util +from .tagger import ActivationsT, Tagger +from ..language import Language from ..errors import Errors from ..language import Language from ..scorer import Scorer from ..training import validate_examples, validate_get_examples from ..util import registry -from .tagger import ActivationsT, Tagger +from .tagger import Tagger + +# See #9050 +BACKWARD_OVERWRITE = False default_model_config = """ [model] @@ -40,12 +47,6 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] "scorer": {"@scorers": "spacy.senter_scorer.v1"}, "save_activations": False, }, - default_config={ - "model": DEFAULT_SENTER_MODEL, - "overwrite": False, - "scorer": {"@scorers": "spacy.senter_scorer.v1"}, - "save_activations": False, - }, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_senter(nlp: Language, @@ -55,13 +56,6 @@ def make_senter(nlp: Language, scorer: Optional[Callable], save_activations: bool): return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations) -def make_senter(nlp: Language, - name: str, - model: Model, - overwrite: bool, - scorer: Optional[Callable], - save_activations: bool): - return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations) def senter_score(examples, **kwargs): @@ -89,10 +83,9 @@ class SentenceRecognizer(Tagger): model, name="senter", *, - overwrite=False, + overwrite=BACKWARD_OVERWRITE, scorer=senter_score, save_activations: bool = False, - save_activations: bool = False, ): """Initialize a sentence recognizer. @@ -100,11 +93,9 @@ class SentenceRecognizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. - overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. 
Defaults to Scorer.score_spans for the attribute "sents". save_activations (bool): save model activations in Doc when annotating. - save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/sentencerecognizer#init """ @@ -115,7 +106,6 @@ class SentenceRecognizer(Tagger): self.cfg = {"overwrite": overwrite} self.scorer = scorer self.save_activations = save_activations - self.save_activations = save_activations @property def labels(self): @@ -133,27 +123,20 @@ class SentenceRecognizer(Tagger): def label_data(self): return None - def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict. - activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict. 
DOCS: https://spacy.io/api/sentencerecognizer#set_annotations """ batch_tag_ids = activations["label_ids"] - batch_tag_ids = activations["label_ids"] if isinstance(docs, Doc): docs = [docs] cdef Doc doc cdef bint overwrite = self.cfg["overwrite"] for i, doc in enumerate(docs): - if self.save_activations: - doc.activations[self.name] = {} - for act_name, acts in activations.items(): - doc.activations[self.name][act_name] = acts[i] if self.save_activations: doc.activations[self.name] = {} for act_name, acts in activations.items(): diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 9d9415692a8..1450bb5d6cb 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,29 +1,14 @@ -from dataclasses import dataclass -from functools import partial -from typing import ( - Any, - Callable, - Dict, - Iterable, - List, - Optional, - Protocol, - Tuple, - Union, - cast, - runtime_checkable, -) +from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast +from typing import Union +from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops +from thinc.api import Optimizer +from thinc.types import Ragged, Ints2d, Floats2d, Ints1d import numpy from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate from thinc.types import Floats2d, Ints1d, Ints2d, Ragged -from ..scorer import Scorer -from ..language import Language -from .trainable_pipe import TrainablePipe -from ..tokens import Doc, SpanGroup, Span -from ..vocab import Vocab -from ..training import Example, validate_examples +from ..compat import Protocol, runtime_checkable from ..errors import Errors from ..language import Language from ..scorer import Scorer @@ -36,9 +21,6 @@ ActivationsT = Dict[str, Union[Floats2d, Ragged]] -ActivationsT = Dict[str, Union[Floats2d, Ragged]] - - spancat_default_config = """ [model] @architectures = "spacy.SpanCategorizer.v1" @@ -194,7 +176,6 @@ def build_preset_spans_suggester(spans_key: str) 
-> Suggester: "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, "save_activations": False, - "save_activations": False, }, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, ) @@ -208,7 +189,6 @@ def make_spancat( threshold: float, max_positive: Optional[int], save_activations: bool, - save_activations: bool, ) -> "SpanCategorizer": """Create a SpanCategorizer component and configure it for multi-label classification to be able to assign multiple labels for each span. @@ -237,7 +217,6 @@ def make_spancat( max_positive (Optional[int]): Maximum number of labels to consider positive per span. Defaults to None, indicating no limit. save_activations (bool): save model activations in Doc when annotating. - save_activations (bool): save model activations in Doc when annotating. """ return SpanCategorizer( nlp.vocab, @@ -317,7 +296,6 @@ def make_spancat_singlelabel( threshold=None, scorer=scorer, save_activations=save_activations, - save_activations=save_activations, ) @@ -381,7 +359,6 @@ def __init__( threshold: Optional[float] = 0.5, scorer: Optional[Callable] = spancat_score, save_activations: bool = False, - save_activations: bool = False, ) -> None: """Initialize the multi-label or multi-class span categorizer. @@ -432,7 +409,6 @@ def __init__( self.name = name self.scorer = scorer self.save_activations = save_activations - self.save_activations = save_activations @property def key(self) -> str: @@ -490,7 +466,6 @@ def label_data(self) -> List[str]: """ return list(self.labels) - def predict(self, docs: Iterable[Doc]) -> ActivationsT: def predict(self, docs: Iterable[Doc]) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. 
@@ -502,8 +477,6 @@ def predict(self, docs: Iterable[Doc]) -> ActivationsT: indices = self.suggester(docs, ops=self.model.ops) scores = self.model.predict((docs, indices)) # type: ignore return {"indices": indices, "scores": scores} - scores = self.model.predict((docs, indices)) # type: ignore - return {"indices": indices, "scores": scores} def set_candidates( self, docs: Iterable[Doc], *, candidates_key: str = "candidates" @@ -523,13 +496,11 @@ def set_candidates( for index in candidates.dataXd: doc.spans[candidates_key].append(doc[index[0] : index[1]]) - def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: """Modify a batch of Doc objects, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. activations: ActivationsT: The activations, produced by SpanCategorizer.predict. - activations: ActivationsT: The activations, produced by SpanCategorizer.predict. DOCS: https://spacy.io/api/spancategorizer#set_annotations """ @@ -538,9 +509,10 @@ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> Non indices = activations["indices"] assert isinstance(indices, Ragged) scores = cast(Floats2d, activations["scores"]) + offset = 0 for i, doc in enumerate(docs): - indices_i = cast(Ints2d, indices[i].dataXd) + indices_i = indices[i].dataXd if self.save_activations: doc.activations[self.name] = {} doc.activations[self.name]["indices"] = indices_i diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index ccd401b6af9..8ecd0c46ee0 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -1,37 +1,32 @@ # cython: infer_types=True, profile=True, binding=True from typing import Callable, Dict, Iterable, List, Optional, Union -from typing import Tuple import numpy import srsly -from thinc.api import Model, set_dropout_rate, Config -from thinc.legacy import LegacySequenceCategoricalCrossentropy +from 
thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config from thinc.types import Floats2d, Ints1d import warnings from itertools import islice -from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union +from typing import Callable, Optional import numpy from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate -from thinc.types import Floats2d, Ints1d -from ..morphology cimport Morphology from ..tokens.doc cimport Doc -from ..vocab cimport Vocab from .. import util -from ..attrs import ID, POS -from ..errors import Errors, Warnings +from ..errors import Errors from ..language import Language -from ..parts_of_speech import X from ..scorer import Scorer from ..training import validate_examples, validate_get_examples from ..util import registry -from .pipe import deserialize_config from .trainable_pipe import TrainablePipe ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]] +# See #9050 +BACKWARD_OVERWRITE = False + default_model_config = """ [model] @architectures = "spacy.Tagger.v2" @@ -59,13 +54,6 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] "neg_prefix": "!", "save_activations": False, }, - default_config={ - "model": DEFAULT_TAGGER_MODEL, - "overwrite": False, - "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, - "neg_prefix": "!", - "save_activations": False, - }, default_score_weights={"tag_acc": 1.0}, ) def make_tagger( @@ -76,7 +64,6 @@ def make_tagger( scorer: Optional[Callable], neg_prefix: str, save_activations: bool, - save_activations: bool, ): """Construct a part-of-speech tagger component. 
@@ -87,8 +74,6 @@ def make_tagger( """ return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, save_activations=save_activations) - return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, - save_activations=save_activations) def tagger_score(examples, **kwargs): @@ -111,11 +96,10 @@ class Tagger(TrainablePipe): model, name="tagger", *, - overwrite=False, + overwrite=BACKWARD_OVERWRITE, scorer=tagger_score, neg_prefix="!", save_activations: bool = False, - save_activations: bool = False, ): """Initialize a part-of-speech tagger. @@ -123,11 +107,9 @@ class Tagger(TrainablePipe): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. - overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_token_attr for the attribute "tag". save_activations (bool): save model activations in Doc when annotating. - save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/tagger#init """ @@ -139,7 +121,6 @@ class Tagger(TrainablePipe): self.cfg = dict(sorted(cfg.items())) self.scorer = scorer self.save_activations = save_activations - self.save_activations = save_activations @property def labels(self): @@ -158,7 +139,6 @@ class Tagger(TrainablePipe): """Data about the labels currently added to the component.""" return tuple(self.cfg["labels"]) - def predict(self, docs) -> ActivationsT: def predict(self, docs) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. 
@@ -173,13 +153,11 @@ class Tagger(TrainablePipe): guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] assert len(guesses) == len(docs) return {"probabilities": guesses, "label_ids": guesses} - return {"probabilities": guesses, "label_ids": guesses} scores = self.model.predict(docs) assert len(scores) == len(docs), (len(scores), len(docs)) guesses = self._scores2guesses(scores) assert len(guesses) == len(docs) return {"probabilities": scores, "label_ids": guesses} - return {"probabilities": scores, "label_ids": guesses} def _scores2guesses(self, scores): guesses = [] @@ -190,28 +168,21 @@ class Tagger(TrainablePipe): guesses.append(doc_guesses) return guesses - def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict. - activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict. 
DOCS: https://spacy.io/api/tagger#set_annotations """ batch_tag_ids = activations["label_ids"] - batch_tag_ids = activations["label_ids"] if isinstance(docs, Doc): docs = [docs] cdef Doc doc cdef bint overwrite = self.cfg["overwrite"] labels = self.labels for i, doc in enumerate(docs): - if self.save_activations: - doc.activations[self.name] = {} - for act_name, acts in activations.items(): - doc.activations[self.name][act_name] = acts[i] if self.save_activations: doc.activations[self.name] = {} for act_name, acts in activations.items(): @@ -271,6 +242,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#rehearse """ + loss_func = SequenceCategoricalCrossentropy() if losses is None: losses = {} losses.setdefault(self.name, 0.0) @@ -284,32 +256,12 @@ class Tagger(TrainablePipe): set_dropout_rate(self.model, drop) tag_scores, bp_tag_scores = self.model.begin_update(docs) tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs) - loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores) + grads, loss = loss_func(tag_scores, tutor_tag_scores) bp_tag_scores(grads) - if sgd is not None: - self.finish_update(sgd) + self.finish_update(sgd) losses[self.name] += loss return losses - def get_teacher_student_loss( - self, teacher_scores: List[Floats2d], student_scores: List[Floats2d] - ) -> Tuple[float, List[Floats2d]]: - """Calculate the loss and its gradient for a batch of student - scores, relative to teacher scores. - - teacher_scores: Scores representing the teacher model's predictions. - student_scores: Scores representing the student model's predictions. - - RETURNS (Tuple[float, float]): The loss and the gradient. 
- - DOCS: https://spacy.io/api/tagger#get_teacher_student_loss - """ - loss_func = SequenceCategoricalCrossentropy(normalize=False) - d_scores, loss = loss_func(student_scores, teacher_scores) - if self.model.ops.xp.isnan(loss): - raise ValueError(Errors.E910.format(name=self.name)) - return float(loss), d_scores - def get_loss(self, examples, scores): """Find the loss and gradient of loss for the batch of documents and their predicted scores. @@ -321,12 +273,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#get_loss """ validate_examples(examples, "Tagger.get_loss") - loss_func = SequenceCategoricalCrossentropy( - names=self.labels, - normalize=False, - neg_prefix=self.cfg["neg_prefix"], - label_smoothing=self.cfg["label_smoothing"] - ) + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"], label_smoothing=self.cfg["label_smoothing"]) # Convert empty tag "" to missing value None so that both misaligned # tokens and tokens with missing annotation have the default missing # value None. 
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 13841dd7bbb..79a98b9bc5f 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,3 +1,7 @@ +from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union +from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config +from thinc.types import Floats2d +import numpy from itertools import islice from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index e1c1fdc7a34..ac024ba3639 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -1,5 +1,9 @@ +from typing import Iterable, Optional, Dict, List, Callable, Any, Union +from thinc.types import Floats2d +from thinc.api import Model, Config + from itertools import islice -from typing import Any, Callable, Dict, Iterable, List, Optional, Union +from typing import Any, Callable, Dict, Iterable, List, Optional from thinc.api import Config, Model from thinc.types import Floats2d @@ -80,8 +84,6 @@ "model": DEFAULT_MULTI_TEXTCAT_MODEL, "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"}, "save_activations": False, - "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"}, - "save_activations": False, }, default_score_weights={ "cats_score": 1.0, @@ -103,9 +105,6 @@ def make_multilabel_textcat( threshold: float, scorer: Optional[Callable], save_activations: bool, -) -> "TextCategorizer": - """Create a TextCategorizer component. The text categorizer predicts categories - save_activations: bool, ) -> "TextCategorizer": """Create a TextCategorizer component. The text categorizer predicts categories over a whole document. 
It can learn one or more labels, and the labels are considered @@ -124,12 +123,6 @@ def make_multilabel_textcat( threshold=threshold, scorer=scorer, save_activations=save_activations, - nlp.vocab, - model, - name, - threshold=threshold, - scorer=scorer, - save_activations=save_activations, ) @@ -162,7 +155,6 @@ def __init__( threshold: float, scorer: Optional[Callable] = textcat_multilabel_score, save_activations: bool = False, - save_activations: bool = False, ) -> None: """Initialize a text categorizer for multi-label classification. @@ -171,7 +163,6 @@ def __init__( name (str): The component instance name, used to add entries to the losses during training. threshold (float): Cutoff to consider a prediction "positive". - scorer (Optional[Callable]): The scoring method. save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/textcategorizer#init @@ -184,7 +175,6 @@ def __init__( self.cfg = dict(cfg) self.scorer = scorer self.save_activations = save_activations - self.save_activations = save_activations @property def support_missing_values(self): diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx index 546a1c48abb..bd360c9501b 100644 --- a/spacy/pipeline/trainable_pipe.pyx +++ b/spacy/pipeline/trainable_pipe.pyx @@ -1,4 +1,4 @@ -# cython: infer_types=True, profile=True, binding=True +# cython: infer_types=True, binding=True from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple import srsly @@ -7,13 +7,13 @@ import warnings from ..tokens.doc cimport Doc -from ..training import validate_examples, validate_distillation_examples +from ..training import validate_examples from ..errors import Errors, Warnings from .pipe import Pipe, deserialize_config from .. 
import util from ..errors import Errors from ..language import Language -from ..training import Example, validate_distillation_examples, validate_examples +from ..training import Example, validate_examples from ..vocab import Vocab from .pipe import Pipe, deserialize_config @@ -59,54 +59,7 @@ cdef class TrainablePipe(Pipe): except Exception as e: error_handler(self.name, self, [doc], e) - - def distill(self, - teacher_pipe: Optional["TrainablePipe"], - examples: Iterable["Example"], - *, - drop: float=0.0, - sgd: Optional[Optimizer]=None, - losses: Optional[Dict[str, float]]=None) -> Dict[str, float]: - """Train a pipe (the student) on the predictions of another pipe - (the teacher). The student is typically trained on the probability - distribution of the teacher, but details may differ per pipe. - - teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn - from. - examples (Iterable[Example]): Distillation examples. The reference - (teacher) and predicted (student) docs must have the same number of - tokens and the same orthography. - drop (float): dropout rate. - sgd (Optional[Optimizer]): An optimizer. Will be created via - create_optimizer if not set. - losses (Optional[Dict[str, float]]): Optional record of loss during - distillation. - RETURNS: The updated losses dictionary. - - DOCS: https://spacy.io/api/pipe#distill - """ - # By default we require a teacher pipe, but there are downstream - # implementations that don't require a pipe. 
- if teacher_pipe is None: - raise ValueError(Errors.E4002.format(name=self.name)) - if losses is None: - losses = {} - losses.setdefault(self.name, 0.0) - validate_distillation_examples(examples, "TrainablePipe.distill") - set_dropout_rate(self.model, drop) - for node in teacher_pipe.model.walk(): - if node.name == "softmax": - node.attrs["softmax_normalize"] = True - teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples]) - student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples]) - loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores) - bp_student_scores(d_scores) - if sgd is not None: - self.finish_update(sgd) - losses[self.name] += loss - return losses - - def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]: + def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: """Apply the pipe to a stream of documents. This usually happens under the hood when the nlp object is called on a text and all components are applied to the Doc. @@ -219,19 +172,6 @@ cdef class TrainablePipe(Pipe): """ raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name)) - def get_teacher_student_loss(self, teacher_scores, student_scores): - """Calculate the loss and its gradient for a batch of student - scores, relative to teacher scores. - - teacher_scores: Scores representing the teacher model's predictions. - student_scores: Scores representing the student model's predictions. - - RETURNS (Tuple[float, float]): The loss and the gradient. - - DOCS: https://spacy.io/api/pipe#get_teacher_student_loss - """ - raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name)) - def create_optimizer(self) -> Optimizer: """Create an optimizer for the pipeline component. 
@@ -268,14 +208,6 @@ cdef class TrainablePipe(Pipe): """ raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name)) - @property - def is_distillable(self) -> bool: - # Normally a pipe overrides `get_teacher_student_loss` to implement - # distillation. In more exceptional cases, a pipe can provide its - # own `distill` implementation. If neither of these methods is - # overridden, the pipe does not implement distillation. - return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss) - @property def is_trainable(self) -> bool: return True diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py index 0f925c0d4e1..ba2ed4e5ff3 100644 --- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py +++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py @@ -1,5 +1,5 @@ -import pickle from typing import cast +import pickle import hypothesis.strategies as st import pytest @@ -10,6 +10,7 @@ from spacy.language import Language from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees from spacy.pipeline.trainable_pipe import TrainablePipe +from spacy.training import Example from spacy.strings import StringStore from spacy.training import Example from spacy.util import make_tempdir @@ -213,53 +214,6 @@ def test_overfitting_IO(top_k): assert doc4[3].lemma_ == "egg" -def test_is_distillable(): - nlp = English() - lemmatizer = nlp.add_pipe("trainable_lemmatizer") - assert lemmatizer.is_distillable - - -def test_distill(): - teacher = English() - teacher_lemmatizer = teacher.add_pipe("trainable_lemmatizer") - teacher_lemmatizer.min_tree_freq = 1 - train_examples = [] - for t in TRAIN_DATA: - train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1])) - - optimizer = teacher.initialize(get_examples=lambda: train_examples) - - for i in range(50): - losses = {} - 
teacher.update(train_examples, sgd=optimizer, losses=losses) - assert losses["trainable_lemmatizer"] < 0.00001 - - student = English() - student_lemmatizer = student.add_pipe("trainable_lemmatizer") - student_lemmatizer.min_tree_freq = 1 - student_lemmatizer.initialize( - get_examples=lambda: train_examples, labels=teacher_lemmatizer.label_data - ) - - distill_examples = [ - Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA - ] - - for i in range(50): - losses = {} - student_lemmatizer.distill( - teacher_lemmatizer, distill_examples, sgd=optimizer, losses=losses - ) - assert losses["trainable_lemmatizer"] < 0.00001 - - test_text = "She likes blue eggs" - doc = student(test_text) - assert doc[0].lemma_ == "she" - assert doc[1].lemma_ == "like" - assert doc[2].lemma_ == "blue" - assert doc[3].lemma_ == "egg" - - def test_lemmatizer_requires_labels(): nlp = English() nlp.add_pipe("trainable_lemmatizer") @@ -403,26 +357,3 @@ def test_save_activations(): ] assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO) assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,) - - -def test_save_activations(): - nlp = English() - lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer")) - lemmatizer.min_tree_freq = 1 - train_examples = [] - for t in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - nlp.initialize(get_examples=lambda: train_examples) - nO = lemmatizer.model.get_dim("nO") - - doc = nlp("This is a test.") - assert "trainable_lemmatizer" not in doc.activations - - lemmatizer.save_activations = True - doc = nlp("This is a test.") - assert list(doc.activations["trainable_lemmatizer"].keys()) == [ - "probabilities", - "tree_ids", - ] - assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO) - assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,) diff --git a/spacy/tests/pipeline/test_entity_linker.py 
b/spacy/tests/pipeline/test_entity_linker.py index a3ab80f7ee0..32e7a265f37 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,18 +1,17 @@ -from typing import Any, Callable, Dict, Iterable, cast +from typing import Callable, Iterable, Dict, Any, cast import pytest from numpy.testing import assert_equal from thinc.types import Ragged -from thinc.types import Ragged from spacy import Language, registry, util from spacy.attrs import ENT_KB_ID from spacy.compat import pickle -from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase +from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase, get_candidates from spacy.lang.en import English from spacy.ml import load_kb -from spacy.ml.models.entity_linker import build_span_maker, get_candidates from spacy.pipeline import EntityLinker, TrainablePipe +from spacy.pipeline.legacy import EntityLinker_v1 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer from spacy.tests.util import make_tempdir @@ -454,17 +453,16 @@ def test_candidate_generation(nlp): mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates - adam_ent_cands = get_candidates(mykb, adam_ent) assert len(get_candidates(mykb, douglas_ent)) == 2 - assert len(adam_ent_cands) == 1 + assert len(get_candidates(mykb, adam_ent)) == 1 assert len(get_candidates(mykb, Adam_ent)) == 0 # default case sensitive assert len(get_candidates(mykb, shrubbery_ent)) == 0 # test the content of the candidates - assert adam_ent_cands[0].entity_id_ == "Q2" - assert adam_ent_cands[0].alias == "adam" - assert_almost_equal(adam_ent_cands[0].entity_freq, 12) - assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9) + assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2" + assert get_candidates(mykb, adam_ent)[0].alias_ == "adam" + assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12) + 
assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9) def test_el_pipe_configuration(nlp): @@ -492,7 +490,7 @@ def create_kb(vocab): assert doc[2].ent_kb_id_ == "Q2" def get_lowercased_candidates(kb, span): - return kb._get_alias_candidates(span.text.lower()) + return kb.get_alias_candidates(span.text.lower()) def get_lowercased_candidates_batch(kb, spans): return [get_lowercased_candidates(kb, span) for span in spans] @@ -551,22 +549,24 @@ def test_vocab_serialization(nlp): mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]) adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) - candidates = mykb._get_alias_candidates("adam") + candidates = mykb.get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity_id == q2_hash - assert candidates[0].entity_id_ == "Q2" - assert candidates[0].alias == "adam" + assert candidates[0].entity == q2_hash + assert candidates[0].entity_ == "Q2" + assert candidates[0].alias == adam_hash + assert candidates[0].alias_ == "adam" with make_tempdir() as d: mykb.to_disk(d / "kb") kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1) kb_new_vocab.from_disk(d / "kb") - candidates = kb_new_vocab._get_alias_candidates("adam") + candidates = kb_new_vocab.get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity_id == q2_hash - assert candidates[0].entity_id_ == "Q2" - assert candidates[0].alias == "adam" + assert candidates[0].entity == q2_hash + assert candidates[0].entity_ == "Q2" + assert candidates[0].alias == adam_hash + assert candidates[0].alias_ == "adam" assert kb_new_vocab.get_vector("Q2") == [2] assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) @@ -586,20 +586,20 @@ def test_append_alias(nlp): mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates - assert len(mykb._get_alias_candidates("douglas")) == 2 + assert 
len(mykb.get_alias_candidates("douglas")) == 2 # append an alias mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2) # test the size of the relevant candidates has been incremented - assert len(mykb._get_alias_candidates("douglas")) == 3 + assert len(mykb.get_alias_candidates("douglas")) == 3 # append the same alias-entity pair again should not work (will throw a warning) with pytest.warns(UserWarning): mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3) # test the size of the relevant candidates remained unchanged - assert len(mykb._get_alias_candidates("douglas")) == 3 + assert len(mykb.get_alias_candidates("douglas")) == 3 @pytest.mark.filterwarnings("ignore:\\[W036") @@ -999,11 +999,11 @@ def test_kb_to_bytes(): assert kb_2.contains_alias("Russ Cochran") assert kb_1.get_size_aliases() == kb_2.get_size_aliases() assert kb_1.get_alias_strings() == kb_2.get_alias_strings() - assert len(kb_1._get_alias_candidates("Russ Cochran")) == len( - kb_2._get_alias_candidates("Russ Cochran") + assert len(kb_1.get_alias_candidates("Russ Cochran")) == len( + kb_2.get_alias_candidates("Russ Cochran") ) - assert len(kb_1._get_alias_candidates("Randomness")) == len( - kb_2._get_alias_candidates("Randomness") + assert len(kb_1.get_alias_candidates("Randomness")) == len( + kb_2.get_alias_candidates("Randomness") ) @@ -1084,6 +1084,7 @@ def test_scorer_links(): @pytest.mark.parametrize( "name,config", [ + ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ], ) @@ -1110,7 +1111,10 @@ def create_kb(vocab): return mykb entity_linker = nlp.add_pipe(name, config={"model": config}) - assert isinstance(entity_linker, EntityLinker) + if config["@architectures"] == "spacy.EntityLinker.v1": + assert isinstance(entity_linker, EntityLinker_v1) + else: + assert isinstance(entity_linker, EntityLinker) 
entity_linker.set_kb(create_kb) optimizer = nlp.initialize(get_examples=lambda: train_examples) @@ -1293,7 +1297,6 @@ def create_kb(vocab): assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL -def test_save_activations(): def test_save_activations(): nlp = English() vector_length = 3 @@ -1309,7 +1312,7 @@ def create_kb(vocab): # create artificial KB - assign same prior weight to the two russ cochran's # Q2146908 (Russ Cochran): American golfer # Q7381115 (Russ Cochran): publisher - mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length) + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) mykb.add_alias( diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 9a6bbc9fc60..c2b65977ac3 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -1,5 +1,4 @@ from typing import cast - import pytest from numpy.testing import assert_almost_equal, assert_equal from thinc.api import get_current_ops @@ -10,7 +9,7 @@ from spacy.language import Language from spacy.morphology import Morphology from spacy.pipeline import TrainablePipe -from spacy.tests.util import make_tempdir +from spacy.attrs import MORPH from spacy.tokens import Doc from spacy.training import Example @@ -78,12 +77,6 @@ def test_implicit_label(): nlp.initialize(get_examples=lambda: train_examples) -def test_is_distillable(): - nlp = English() - morphologizer = nlp.add_pipe("morphologizer") - assert morphologizer.is_distillable - - def test_no_resize(): nlp = Language() morphologizer = nlp.add_pipe("morphologizer") @@ -255,25 +248,3 @@ def test_save_activations(): } assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6) assert doc.activations["morphologizer"]["label_ids"].shape == (5,) - - -def 
test_save_activations(): - nlp = English() - morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer")) - train_examples = [] - for inst in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) - nlp.initialize(get_examples=lambda: train_examples) - - doc = nlp("This is a test.") - assert "morphologizer" not in doc.activations - - morphologizer.save_activations = True - doc = nlp("This is a test.") - assert "morphologizer" in doc.activations - assert set(doc.activations["morphologizer"].keys()) == { - "label_ids", - "probabilities", - } - assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6) - assert doc.activations["morphologizer"]["label_ids"].shape == (5,) diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 9a798eae890..2e40d86ff48 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -1,5 +1,4 @@ from typing import cast - import pytest from numpy.testing import assert_equal @@ -8,17 +7,10 @@ from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TrainablePipe -from spacy.pipeline import TrainablePipe from spacy.tests.util import make_tempdir from spacy.training import Example -def test_is_distillable(): - nlp = English() - senter = nlp.add_pipe("senter") - assert senter.is_distillable - - def test_label_types(): nlp = Language() senter = nlp.add_pipe("senter") @@ -134,26 +126,3 @@ def test_save_activations(): assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"} assert doc.activations["senter"]["probabilities"].shape == (5, nO) assert doc.activations["senter"]["label_ids"].shape == (5,) - - -def test_save_activations(): - # Test if activations are correctly added to Doc when requested. 
- nlp = English() - senter = cast(TrainablePipe, nlp.add_pipe("senter")) - - train_examples = [] - for t in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - - nlp.initialize(get_examples=lambda: train_examples) - nO = senter.model.get_dim("nO") - - doc = nlp("This is a test.") - assert "senter" not in doc.activations - - senter.save_activations = True - doc = nlp("This is a test.") - assert "senter" in doc.activations - assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"} - assert doc.activations["senter"]["probabilities"].shape == (5, nO) - assert doc.activations["senter"]["label_ids"].shape == (5,) diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 05e814f0733..5deb323dd71 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,5 +1,4 @@ from typing import cast - import pytest from numpy.testing import assert_almost_equal, assert_equal from thinc.api import compounding, get_current_ops @@ -9,7 +8,7 @@ from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TrainablePipe -from spacy.training import Example +from thinc.api import compounding from ..util import make_tempdir @@ -25,9 +24,7 @@ def test_issue4348(): optimizer = nlp.initialize() for i in range(5): losses = {} - batches = util.minibatch( - TRAIN_DATA, size=compounding(4.0, 32.0, 1.001).to_generator() - ) + batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: nlp.update(batch, sgd=optimizer, losses=losses) @@ -240,52 +237,6 @@ def test_overfitting_IO(): assert doc3[0].tag_ != "N" -def test_is_distillable(): - nlp = English() - tagger = nlp.add_pipe("tagger") - assert tagger.is_distillable - - -def test_distill(): - teacher = English() - teacher_tagger = teacher.add_pipe("tagger") - train_examples = [] - for t in TRAIN_DATA: - 
train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1])) - - optimizer = teacher.initialize(get_examples=lambda: train_examples) - - for i in range(50): - losses = {} - teacher.update(train_examples, sgd=optimizer, losses=losses) - assert losses["tagger"] < 0.00001 - - student = English() - student_tagger = student.add_pipe("tagger") - student_tagger.min_tree_freq = 1 - student_tagger.initialize( - get_examples=lambda: train_examples, labels=teacher_tagger.label_data - ) - - distill_examples = [ - Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA - ] - - for i in range(50): - losses = {} - student_tagger.distill( - teacher_tagger, distill_examples, sgd=optimizer, losses=losses - ) - assert losses["tagger"] < 0.00001 - - test_text = "I like blue eggs" - doc = student(test_text) - assert doc[0].tag_ == "N" - assert doc[1].tag_ == "V" - assert doc[2].tag_ == "J" - assert doc[3].tag_ == "N" - - def test_save_activations(): # Test if activations are correctly added to Doc when requested. 
nlp = English() diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index f834597fafe..710dac0571d 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,5 +1,5 @@ -import random from typing import cast +import random import numpy.random import pytest @@ -13,16 +13,12 @@ from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TextCategorizer, TrainablePipe -from spacy.pipeline.textcat import ( - single_label_bow_config, - single_label_cnn_config, - single_label_default_config, -) -from spacy.pipeline.textcat_multilabel import ( - multi_label_bow_config, - multi_label_cnn_config, - multi_label_default_config, -) +from spacy.pipeline.textcat import single_label_bow_config +from spacy.pipeline.textcat import single_label_cnn_config +from spacy.pipeline.textcat import single_label_default_config +from spacy.pipeline.textcat_multilabel import multi_label_bow_config +from spacy.pipeline.textcat_multilabel import multi_label_cnn_config +from spacy.pipeline.textcat_multilabel import multi_label_default_config from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer from spacy.tokens import Doc, DocBin @@ -104,9 +100,7 @@ def test_issue3611(): optimizer = nlp.initialize() for i in range(3): losses = {} - batches = util.minibatch( - train_data, size=compounding(4.0, 32.0, 1.001).to_generator() - ) + batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) for batch in batches: nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) @@ -143,9 +137,7 @@ def test_issue4030(): optimizer = nlp.initialize() for i in range(3): losses = {} - batches = util.minibatch( - train_data, size=compounding(4.0, 32.0, 1.001).to_generator() - ) + batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) for batch in batches: nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) @@ -304,7 
+296,6 @@ def test_issue9904(): examples = get_examples() scores = textcat.predict([eg.predicted for eg in examples])["probabilities"] - scores = textcat.predict([eg.predicted for eg in examples])["probabilities"] loss = textcat.get_loss(examples, scores)[0] loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0] @@ -474,8 +465,6 @@ def test_no_resize(name, textcat_config): # CNN ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), - ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), - ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), ], ) # fmt: on @@ -613,12 +602,6 @@ def test_initialize_examples(name, get_examples, train_data): nlp.initialize(get_examples=get_examples()) -def test_is_distillable(): - nlp = English() - textcat = nlp.add_pipe("textcat") - assert not textcat.is_distillable - - def test_overfitting_IO(): # Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly fix_random_seed(0) @@ -963,11 +946,9 @@ def test_textcat_multi_threshold(): assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 -def test_save_activations(): def test_save_activations(): nlp = English() textcat = cast(TrainablePipe, nlp.add_pipe("textcat")) - textcat = 
cast(TrainablePipe, nlp.add_pipe("textcat")) train_examples = [] for text, annotations in TRAIN_DATA_SINGLE_LABEL: @@ -984,34 +965,6 @@ def test_save_activations(): assert doc.activations["textcat"]["probabilities"].shape == (nO,) -def test_save_activations_multi(): - nlp = English() - textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel")) - - train_examples = [] - for text, annotations in TRAIN_DATA_MULTI_LABEL: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - nlp.initialize(get_examples=lambda: train_examples) - nO = textcat.model.get_dim("nO") - - doc = nlp("This is a test.") - assert "textcat_multilabel" not in doc.activations - - textcat.save_activations = True - doc = nlp("This is a test.") - assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"] - assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,) - nO = textcat.model.get_dim("nO") - - doc = nlp("This is a test.") - assert "textcat" not in doc.activations - - textcat.save_activations = True - doc = nlp("This is a test.") - assert list(doc.activations["textcat"].keys()) == ["probabilities"] - assert doc.activations["textcat"]["probabilities"].shape == (nO,) - - def test_save_activations_multi(): nlp = English() textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel")) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 9fb6a72c87f..fc0404f1423 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -50,6 +50,8 @@ cdef class Doc: cdef public dict activations + cdef public dict activations + cdef public dict user_hooks cdef public dict user_token_hooks cdef public dict user_span_hooks diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index dc7c0143029..5fda6f2f789 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -8,6 +8,7 @@ from typing import ( List, Optional, Protocol, + Sequence, Tuple, Union, overload, @@ -16,15 +17,20 @@ from typing import ( import numpy as np from 
cymem.cymem import Pool from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged - +from .span import Span +from .token import Token +from .span_groups import SpanGroups +from .retokenizer import Retokenizer from ..lexeme import Lexeme from ..vocab import Vocab -from .retokenizer import Retokenizer +from ._dict_proxies import SpanGroups +from ._retokenize import Retokenizer from .span import Span -from .span_groups import SpanGroups from .token import Token from .underscore import Underscore +DOCBIN_ALL_ATTRS: Tuple[str, ...] + class DocMethod(Protocol): def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc] @@ -34,6 +40,7 @@ class Doc: spans: SpanGroups max_length: int length: int + sentiment: float activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]] cats: Dict[str, float] user_hooks: Dict[str, Callable[..., Any]] @@ -118,7 +125,6 @@ class Doc: start_idx: int, end_idx: int, label: Union[int, str] = ..., - *, kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., alignment_mode: str = ..., @@ -146,12 +152,12 @@ class Doc: blocked: Optional[List[Span]] = ..., missing: Optional[List[Span]] = ..., outside: Optional[List[Span]] = ..., - default: str = ..., + default: str = ... ) -> None: ... @property - def noun_chunks(self) -> Tuple[Span]: ... + def noun_chunks(self) -> Iterator[Span]: ... @property - def sents(self) -> Tuple[Span]: ... + def sents(self) -> Iterator[Span]: ... @property def lang(self) -> int: ... @property diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx index e92c0e833e0..310ce0dc88d 100644 --- a/website/docs/api/doc.mdx +++ b/website/docs/api/doc.mdx @@ -214,7 +214,6 @@ alignment mode `"strict". | `start` | The index of the first character of the span. ~~int~~ | | `end` | The index of the last character after the span. ~~int~~ | | `label` | A label to attach to the span, e.g. for named entities. 
~~Union[int, str]~~ | -| _keyword-only_ | | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | @@ -654,10 +653,11 @@ the [`TextCategorizer`](/api/textcategorizer). ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"} -Returns a tuple of the base noun phrases in the doc, if the document has been -syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that -does not permit other NPs to be nested within it – so no NP-level coordination, -no prepositional phrases, and no relative clauses. +Iterate over the base noun phrases in the document. Yields base noun-phrase +`Span` objects, if the document has been syntactically parsed. A base noun +phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be +nested within it – so no NP-level coordination, no prepositional phrases, and no +relative clauses. To customize the noun chunk iterator in a loaded pipeline, modify [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk` @@ -674,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised. > assert chunks[1].text == "another phrase" > ``` -| Name | Description | -| ----------- | -------------------------------------------- | -| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ | +| Name | Description | +| ---------- | ------------------------------------- | +| **YIELDS** | Noun chunks in the document. ~~Span~~ | ## Doc.sents {id="sents",tag="property",model="sentences"} -Returns a tuple of the sentences in the document. 
Sentence spans have no label. +Iterate over the sentences in the document. Sentence spans have no label. This property is only available when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the @@ -696,9 +696,9 @@ will raise an error otherwise. > assert [s.root.text for s in sents] == ["is", "'s"] > ``` -| Name | Description | -| ----------- | ------------------------------------------ | -| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ | +| Name | Description | +| ---------- | ----------------------------------- | +| **YIELDS** | Sentences in the document. ~~Span~~ | ## Doc.has_vector {id="has_vector",tag="property",model="vectors"} @@ -762,6 +762,7 @@ The L2 norm of the document's vector representation. | `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | | `lang` 2.1 | Language of the document's vocabulary. ~~int~~ | | `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ | +| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | @@ -785,6 +786,7 @@ serialization by passing in the string names via the `exclude` argument. | Name | Description | | ------------------ | --------------------------------------------- | | `text` | The value of the `Doc.text` attribute. | +| `sentiment` | The value of the `Doc.sentiment` attribute. | | `tensor` | The value of the `Doc.tensor` attribute. | | `user_data` | The value of the `Doc.user_data` dictionary. | | `user_data_keys` | The keys of the `Doc.user_data` dictionary. 
| diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index f4b83d88bbf..fe720a60dd1 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -53,21 +53,20 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Description | -| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | -| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. 
~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ | -| `generate_empty_kb` 3.5.1 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | +| Setting | Description | +| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. 
~~bool~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | +| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | +| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx index 9514bc773b9..1fda807cb32 100644 --- a/website/docs/api/morphologizer.mdx +++ b/website/docs/api/morphologizer.mdx @@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.
| Setting | Description | | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ | +| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | | `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | | `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ | @@ -121,39 +121,6 @@ delegate to the [`predict`](/api/morphologizer#predict) and | `doc` | The document to process. ~~Doc~~ | | **RETURNS** | The processed document. ~~Doc~~ | -## Morphologizer.distill {id="distill", tag="method,experimental", version="4"} - -Train a pipe (the student) on the predictions of another pipe (the teacher). The -student is typically trained on the probability distribution of the teacher, but -details may differ per pipe. The goal of distillation is to transfer knowledge -from the teacher to the student. - -The distillation is performed on ~~Example~~ objects. The `Example.reference` -and `Example.predicted` ~~Doc~~s must have the same number of tokens and the -same orthography. 
Even though the reference does not need have to have gold -annotations, the teacher could adds its own annotations when necessary. - -This feature is experimental. - -> #### Example -> -> ```python -> teacher_pipe = teacher.add_pipe("morphologizer") -> student_pipe = student.add_pipe("morphologizer") -> optimizer = nlp.resume_training() -> losses = student.distill(teacher_pipe, examples, sgd=optimizer) -> ``` - -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ | -| `examples` | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | Dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | - ## Morphologizer.pipe {id="pipe",tag="method"} Apply the pipe to a stream of documents. This usually happens under the hood @@ -292,27 +259,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## Morphologizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"} - -Calculate the loss and its gradient for the batch of student scores relative to -the teacher scores. 
- -> #### Example -> -> ```python -> teacher_morphologizer = teacher.get_pipe("morphologizer") -> student_morphologizer = student.add_pipe("morphologizer") -> student_scores = student_morphologizer.predict([eg.predicted for eg in examples]) -> teacher_scores = teacher_morphologizer.predict([eg.predicted for eg in examples]) -> loss, d_loss = student_morphologizer.get_teacher_student_loss(teacher_scores, student_scores) -> ``` - -| Name | Description | -| ---------------- | --------------------------------------------------------------------------- | -| `teacher_scores` | Scores representing the teacher model's predictions. | -| `student_scores` | Scores representing the student model's predictions. | -| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | - ## Morphologizer.create_optimizer {id="create_optimizer",tag="method"} Create an optimizer for the pipeline component.