diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml index 910cfdc40ff..78a27cfa3ba 100644 --- a/.github/workflows/explosionbot.yml +++ b/.github/workflows/explosionbot.yml @@ -15,7 +15,7 @@ jobs: env: GITHUB_CONTEXT: ${{ toJson(github) }} run: echo "$GITHUB_CONTEXT" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 - name: Install and run explosion-bot run: | diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml index c6ea98f76c7..66e0707e0d5 100644 --- a/.github/workflows/gputests.yml +++ b/.github/workflows/gputests.yml @@ -9,7 +9,7 @@ jobs: strategy: fail-fast: false matrix: - branch: [master, main] + branch: [master, v4] if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/lock.yml b/.github/workflows/lock.yml index 6c3985a930a..2bbdd64c771 100644 --- a/.github/workflows/lock.yml +++ b/.github/workflows/lock.yml @@ -16,7 +16,7 @@ jobs: if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - - uses: dessant/lock-threads@v4 + - uses: dessant/lock-threads@v5 with: process-only: 'issues' issue-inactive-days: '30' diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index 4a4f0800590..17d8989faa8 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -9,12 +9,12 @@ jobs: strategy: fail-fast: false matrix: - branch: [master, main] + branch: [master, v4] if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: ref: ${{ matrix.branch }} - name: Get commits from past 24 hours diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml index 33851fbcc18..01731ffe0d7 100644 --- a/.github/workflows/spacy_universe_alert.yml +++ b/.github/workflows/spacy_universe_alert.yml @@ -18,7 +18,7 @@ jobs: run: | echo "$GITHUB_CONTEXT" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: '3.10' diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c44009dd48f..53a0558c6f1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -25,13 +25,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Configure Python version uses: actions/setup-python@v4 with: python-version: "3.9" - architecture: x64 - name: black run: | @@ -71,13 +70,12 @@ jobs: steps: - name: Check out repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Configure Python version uses: actions/setup-python@v4 with: python-version: ${{ matrix.python_version }} - architecture: x64 - name: Install dependencies run: | diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml index a4ce633dde6..144a03c637e 100644 --- a/.github/workflows/universe_validation.yml +++ b/.github/workflows/universe_validation.yml @@ -20,13 +20,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Configure Python version uses: actions/setup-python@v4 with: python-version: "3.9" - architecture: x64 - name: Validate website/meta/universe.json run: | diff --git a/LICENSE b/LICENSE index 979f5ade7b4..6cb7810c6ee 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (C) 2016-2023 ExplosionAI 
GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal +Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/requirements.txt b/requirements.txt index 21293eb2829..9c78b33d940 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,9 +9,8 @@ murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 -typer>=0.3.0,<0.10.0 -smart-open>=5.2.1,<7.0.0 -weasel>=0.1.0,<0.4.0 +typer>=0.3.0,<1.0.0 +weasel>=0.1.0,<0.5.0 # Third party dependencies numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" diff --git a/setup.cfg b/setup.cfg index 7d87a382902..e46bd4d5a33 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,10 +41,9 @@ install_requires = wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 - weasel>=0.1.0,<0.4.0 + weasel>=0.1.0,<0.5.0 # Third-party dependencies - typer>=0.3.0,<0.10.0 - smart-open>=5.2.1,<7.0.0 + typer>=0.3.0,<1.0.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index d518444075e..cc93c34db3b 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,5 +1,7 @@ from wasabi import msg +# Needed for testing +from . import download as download_module # noqa: F401 from ._util import app, setup_cli # noqa: F401 from .apply import apply # noqa: F401 from .assemble import assemble_cli # noqa: F401 diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 2ccde261377..fe8c310e0bf 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,5 +1,6 @@ import sys from typing import Optional, Sequence +from urllib.parse import urljoin import requests import typer @@ -64,6 +65,13 @@ def download( ) pip_args = pip_args + ("--no-deps",) if direct: + # Reject model names with '/', in order to prevent shenanigans. + if "/" in model: + msg.fail( + title="Model download rejected", + text=f"Cannot download model '{model}'. Models are expected to be file names, not URLs or fragments", + exits=True, + ) components = model.split("-") model_name = "".join(components[:-1]) version = components[-1] @@ -156,7 +164,16 @@ def get_latest_version(model: str) -> str: def download_model( filename: str, user_pip_args: Optional[Sequence[str]] = None ) -> None: - download_url = about.__download_url__ + "/" + filename + # Construct the download URL carefully. We need to make sure we don't + # allow relative paths or other shenanigans to trick us into downloading + # from outside our own repo. + base_url = about.__download_url__ + # urljoin requires that the path ends with /, or the last path part will be dropped + if not base_url.endswith("/"): + base_url = about.__download_url__ + "/" + download_url = urljoin(base_url, filename) + if not download_url.startswith(about.__download_url__): + raise ValueError(f"Download from {filename} rejected.
Was it a relative path?") pip_args = list(user_pip_args) if user_pip_args is not None else [] cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] run_command(cmd) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 48077fa511d..3e86495e7c1 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -39,7 +39,7 @@ def find_threshold_cli( # fmt: on ): """ - Runs prediction trials for a trained model with varying tresholds to maximize + Runs prediction trials for a trained model with varying thresholds to maximize the specified metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` @@ -81,7 +81,7 @@ def find_threshold( silent: bool = True, ) -> Tuple[float, float, Dict[float, float]]: """ - Runs prediction trials for models with varying tresholds to maximize the specified metric. + Runs prediction trials for models with varying thresholds to maximize the specified metric. model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory. data_path (Path): Path to file with DocBin with docs to use for threshold search. pipe_name (str): Name of pipe to examine thresholds for. diff --git a/spacy/errors.py b/spacy/errors.py index 98ddbe9f62b..65375888816 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -215,6 +215,7 @@ class Warnings(metaclass=ErrorsWithCodes): "key attribute for vectors, configure it through Vectors(attr=) or " "'spacy init vectors --attr'") W126 = ("These keys are unsupported: {unsupported}") + W127 = ("Not all `Language.pipe` worker processes completed successfully") # v4 warning strings W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability " diff --git a/spacy/language.py b/spacy/language.py index 59d25586b30..204508dc455 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1844,6 +1844,9 @@ def prepare_input( for proc in procs: proc.join() + if not all(proc.exitcode == 0 for proc in procs): + warnings.warn(Warnings.W127) + def _link_components(self) -> None: """Register 'listeners' within pipeline components, to allow them to effectively share weights. @@ -2467,6 +2470,7 @@ def _apply_pipes( if isinstance(texts_with_ctx, _WorkDoneSentinel): sender.close() receiver.close() + return docs = ( ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx @@ -2492,6 +2496,7 @@ def _apply_pipes( # stop processing. sender.close() receiver.close() + return class _Sender: diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 41fc8f1d2b1..b0c3784d86e 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -164,32 +164,34 @@ cdef class Lexeme: vector = self.vector return numpy.sqrt((vector**2).sum()) - property vector: + @property + def vector(self): """A real-valued meaning representation. RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array representing the lexeme's semantics. 
""" - def __get__(self): - cdef int length = self.vocab.vectors_length - if length == 0: - raise ValueError(Errors.E010) - return self.vocab.get_vector(self.c.orth) - - def __set__(self, vector): - if len(vector) != self.vocab.vectors_length: - raise ValueError(Errors.E073.format(new_length=len(vector), - length=self.vocab.vectors_length)) - self.vocab.set_vector(self.c.orth, vector) - - property rank: + cdef int length = self.vocab.vectors_length + if length == 0: + raise ValueError(Errors.E010) + return self.vocab.get_vector(self.c.orth) + + @vector.setter + def vector(self, vector): + if len(vector) != self.vocab.vectors_length: + raise ValueError(Errors.E073.format(new_length=len(vector), + length=self.vocab.vectors_length)) + self.vocab.set_vector(self.c.orth, vector) + + @property + def rank(self): """RETURNS (str): Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors.""" - def __get__(self): - return self.c.id + return self.c.id - def __set__(self, value): - self.c.id = value + @rank.setter + def rank(self, value): + self.c.id = value @property def orth_(self): @@ -203,306 +205,338 @@ cdef class Lexeme: """RETURNS (str): The original verbatim text of the lexeme.""" return self.orth_ - property lower: + @property + def lower(self): """RETURNS (uint64): Lowercase form of the lexeme.""" - def __get__(self): - return self.c.lower + return self.c.lower - def __set__(self, attr_t x): - self.c.lower = x + @lower.setter + def lower(self, attr_t x): + self.c.lower = x - property norm: + @property + def norm(self): """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the lexeme text. """ - def __get__(self): - return self.c.norm + return self.c.norm - def __set__(self, attr_t x): - if "lexeme_norm" not in self.vocab.lookups: - self.vocab.lookups.add_table("lexeme_norm") - norm_table = self.vocab.lookups.get_table("lexeme_norm") - norm_table[self.c.orth] = self.vocab.strings[x] - self.c.norm = x + @norm.setter + def norm(self, attr_t x): + if "lexeme_norm" not in self.vocab.lookups: + self.vocab.lookups.add_table("lexeme_norm") + norm_table = self.vocab.lookups.get_table("lexeme_norm") + norm_table[self.c.orth] = self.vocab.strings[x] + self.c.norm = x - property shape: + @property + def shape(self): """RETURNS (uint64): Transform of the word's string, to show orthographic features. """ - def __get__(self): - return self.c.shape + return self.c.shape - def __set__(self, attr_t x): - self.c.shape = x + @shape.setter + def shape(self, attr_t x): + self.c.shape = x - property prefix: + @property + def prefix(self): """RETURNS (uint64): Length-N substring from the start of the word. Defaults to `N=1`. """ - def __get__(self): - return self.c.prefix + return self.c.prefix - def __set__(self, attr_t x): - self.c.prefix = x + @prefix.setter + def prefix(self, attr_t x): + self.c.prefix = x - property suffix: + @property + def suffix(self): """RETURNS (uint64): Length-N substring from the end of the word. Defaults to `N=3`. 
""" - def __get__(self): - return self.c.suffix + return self.c.suffix - def __set__(self, attr_t x): - self.c.suffix = x + @suffix.setter + def suffix(self, attr_t x): + self.c.suffix = x - property cluster: + @property + def cluster(self): """RETURNS (int): Brown cluster ID.""" - def __get__(self): - cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) - return cluster_table.get(self.c.orth, 0) + cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) + return cluster_table.get(self.c.orth, 0) - def __set__(self, int x): - cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) - cluster_table[self.c.orth] = x + @cluster.setter + def cluster(self, int x): + cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) + cluster_table[self.c.orth] = x - property lang: + @property + def lang(self): """RETURNS (uint64): Language of the parent vocabulary.""" - def __get__(self): - return self.c.lang + return self.c.lang - def __set__(self, attr_t x): - self.c.lang = x + @lang.setter + def lang(self, attr_t x): + self.c.lang = x - property prob: + @property + def prob(self): """RETURNS (float): Smoothed log probability estimate of the lexeme's type.""" - def __get__(self): - prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) - settings_table = self.vocab.lookups.get_table("lexeme_settings", {}) - default_oov_prob = settings_table.get("oov_prob", -20.0) - return prob_table.get(self.c.orth, default_oov_prob) + prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) + settings_table = self.vocab.lookups.get_table("lexeme_settings", {}) + default_oov_prob = settings_table.get("oov_prob", -20.0) + return prob_table.get(self.c.orth, default_oov_prob) - def __set__(self, float x): - prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) - prob_table[self.c.orth] = x + @prob.setter + def prob(self, float x): + prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) + prob_table[self.c.orth] = x - property lower_: + @property + def lower_(self): """RETURNS (str): Lowercase form of the word.""" - def __get__(self): - return self.vocab.strings[self.c.lower] + return self.vocab.strings[self.c.lower] - def __set__(self, str x): - self.c.lower = self.vocab.strings.add(x) + @lower_.setter + def lower_(self, str x): + self.c.lower = self.vocab.strings.add(x) - property norm_: + @property + def norm_(self): """RETURNS (str): The lexeme's norm, i.e. a normalised form of the lexeme text. """ - def __get__(self): - return self.vocab.strings[self.c.norm] + return self.vocab.strings[self.c.norm] - def __set__(self, str x): - self.norm = self.vocab.strings.add(x) + @norm_.setter + def norm_(self, str x): + self.norm = self.vocab.strings.add(x) - property shape_: + @property + def shape_(self): """RETURNS (str): Transform of the word's string, to show orthographic features. """ - def __get__(self): - return self.vocab.strings[self.c.shape] + return self.vocab.strings[self.c.shape] - def __set__(self, str x): - self.c.shape = self.vocab.strings.add(x) + @shape_.setter + def shape_(self, str x): + self.c.shape = self.vocab.strings.add(x) - property prefix_: + @property + def prefix_(self): """RETURNS (str): Length-N substring from the start of the word. Defaults to `N=1`. 
""" - def __get__(self): - return self.vocab.strings[self.c.prefix] + return self.vocab.strings[self.c.prefix] - def __set__(self, str x): - self.c.prefix = self.vocab.strings.add(x) + @prefix_.setter + def prefix_(self, str x): + self.c.prefix = self.vocab.strings.add(x) - property suffix_: + @property + def suffix_(self): """RETURNS (str): Length-N substring from the end of the word. Defaults to `N=3`. """ - def __get__(self): - return self.vocab.strings[self.c.suffix] + return self.vocab.strings[self.c.suffix] - def __set__(self, str x): - self.c.suffix = self.vocab.strings.add(x) + @suffix_.setter + def suffix_(self, str x): + self.c.suffix = self.vocab.strings.add(x) - property lang_: + @property + def lang_(self): """RETURNS (str): Language of the parent vocabulary.""" - def __get__(self): - return self.vocab.strings[self.c.lang] + return self.vocab.strings[self.c.lang] - def __set__(self, str x): - self.c.lang = self.vocab.strings.add(x) + @lang_.setter + def lang_(self, str x): + self.c.lang = self.vocab.strings.add(x) - property flags: + @property + def flags(self): """RETURNS (uint64): Container of the lexeme's binary flags.""" - def __get__(self): - return self.c.flags + return self.c.flags - def __set__(self, flags_t x): - self.c.flags = x + @flags.setter + def flags(self, flags_t x): + self.c.flags = x @property def is_oov(self): """RETURNS (bool): Whether the lexeme is out-of-vocabulary.""" return self.orth not in self.vocab.vectors - property is_stop: + @property + def is_stop(self): """RETURNS (bool): Whether the lexeme is a stop word.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_STOP) + return Lexeme.c_check_flag(self.c, IS_STOP) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_STOP, x) + @is_stop.setter + def is_stop(self, bint x): + Lexeme.c_set_flag(self.c, IS_STOP, x) - property is_alpha: + @property + def is_alpha(self): """RETURNS (bool): Whether the lexeme consists of alphabetic characters. Equivalent to `lexeme.text.isalpha()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_ALPHA) + return Lexeme.c_check_flag(self.c, IS_ALPHA) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_ALPHA, x) + @is_alpha.setter + def is_alpha(self, bint x): + Lexeme.c_set_flag(self.c, IS_ALPHA, x) - property is_ascii: + @property + def is_ascii(self): """RETURNS (bool): Whether the lexeme consists of ASCII characters. Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_ASCII) + return Lexeme.c_check_flag(self.c, IS_ASCII) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_ASCII, x) + @is_ascii.setter + def is_ascii(self, bint x): + Lexeme.c_set_flag(self.c, IS_ASCII, x) - property is_digit: + @property + def is_digit(self): """RETURNS (bool): Whether the lexeme consists of digits. Equivalent to `lexeme.text.isdigit()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_DIGIT) + return Lexeme.c_check_flag(self.c, IS_DIGIT) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_DIGIT, x) + @is_digit.setter + def is_digit(self, bint x): + Lexeme.c_set_flag(self.c, IS_DIGIT, x) - property is_lower: + @property + def is_lower(self): """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to `lexeme.text.islower()`. 
""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_LOWER) + return Lexeme.c_check_flag(self.c, IS_LOWER) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_LOWER, x) + @is_lower.setter + def is_lower(self, bint x): + Lexeme.c_set_flag(self.c, IS_LOWER, x) - property is_upper: + @property + def is_upper(self): """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to `lexeme.text.isupper()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_UPPER) + return Lexeme.c_check_flag(self.c, IS_UPPER) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_UPPER, x) + @is_upper.setter + def is_upper(self, bint x): + Lexeme.c_set_flag(self.c, IS_UPPER, x) - property is_title: + @property + def is_title(self): """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to `lexeme.text.istitle()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_TITLE) + return Lexeme.c_check_flag(self.c, IS_TITLE) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_TITLE, x) + @is_title.setter + def is_title(self, bint x): + Lexeme.c_set_flag(self.c, IS_TITLE, x) - property is_punct: + @property + def is_punct(self): """RETURNS (bool): Whether the lexeme is punctuation.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_PUNCT) + return Lexeme.c_check_flag(self.c, IS_PUNCT) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_PUNCT, x) + @is_punct.setter + def is_punct(self, bint x): + Lexeme.c_set_flag(self.c, IS_PUNCT, x) - property is_space: + @property + def is_space(self): """RETURNS (bool): Whether the lexeme consist of whitespace characters. Equivalent to `lexeme.text.isspace()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_SPACE) + return Lexeme.c_check_flag(self.c, IS_SPACE) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_SPACE, x) + @is_space.setter + def is_space(self, bint x): + Lexeme.c_set_flag(self.c, IS_SPACE, x) - property is_bracket: + @property + def is_bracket(self): """RETURNS (bool): Whether the lexeme is a bracket.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_BRACKET) + return Lexeme.c_check_flag(self.c, IS_BRACKET) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_BRACKET, x) + @is_bracket.setter + def is_bracket(self, bint x): + Lexeme.c_set_flag(self.c, IS_BRACKET, x) - property is_quote: + @property + def is_quote(self): """RETURNS (bool): Whether the lexeme is a quotation mark.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_QUOTE) + return Lexeme.c_check_flag(self.c, IS_QUOTE) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_QUOTE, x) + @is_quote.setter + def is_quote(self, bint x): + Lexeme.c_set_flag(self.c, IS_QUOTE, x) - property is_left_punct: + @property + def is_left_punct(self): """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) + return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) + @is_left_punct.setter + def is_left_punct(self, bint x): + Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) - property is_right_punct: + @property + def is_right_punct(self): """RETURNS (bool): Whether the lexeme is right punctuation, e.g. 
).""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) + return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) + @is_right_punct.setter + def is_right_punct(self, bint x): + Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) - property is_currency: + @property + def is_currency(self): """RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_CURRENCY) + return Lexeme.c_check_flag(self.c, IS_CURRENCY) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_CURRENCY, x) + @is_currency.setter + def is_currency(self, bint x): + Lexeme.c_set_flag(self.c, IS_CURRENCY, x) - property like_url: + @property + def like_url(self): """RETURNS (bool): Whether the lexeme resembles a URL.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, LIKE_URL) + return Lexeme.c_check_flag(self.c, LIKE_URL) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, LIKE_URL, x) + @like_url.setter + def like_url(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_URL, x) - property like_num: + @property + def like_num(self): """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9", "10", "ten", etc. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, LIKE_NUM) + return Lexeme.c_check_flag(self.c, LIKE_NUM) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, LIKE_NUM, x) + @like_num.setter + def like_num(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_NUM, x) - property like_email: + @property + def like_email(self): """RETURNS (bool): Whether the lexeme resembles an email address.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, LIKE_EMAIL) + return Lexeme.c_check_flag(self.c, LIKE_EMAIL) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, LIKE_EMAIL, x) + @like_email.setter + def like_email(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_EMAIL, x) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 2d6baa1bc7c..b0a6d78a6f0 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -169,6 +169,23 @@ def build_text_classifier_v2( model.set_ref("output_layer", linear_model.get_ref("output_layer")) model.attrs["multi_label"] = not exclusive_classes + model.init = init_ensemble_textcat # type: ignore[assignment] + return model + + +def init_ensemble_textcat(model, X, Y) -> Model: + # When tok2vec is lazily initialized, we need to initialize it before + # the rest of the chain to ensure that we can get its width. 
+ tok2vec = model.get_ref("tok2vec") + tok2vec.initialize(X) + + tok2vec_width = get_tok2vec_width(model) + model.get_ref("attention_layer").set_dim("nO", tok2vec_width) + model.get_ref("maxout_layer").set_dim("nO", tok2vec_width) + model.get_ref("maxout_layer").set_dim("nI", tok2vec_width) + model.get_ref("norm_layer").set_dim("nI", tok2vec_width) + model.get_ref("norm_layer").set_dim("nO", tok2vec_width) + init_chain(model, X, Y) return model @@ -241,6 +258,7 @@ def _build_parametric_attention_with_residual_nonlinear( parametric_attention.set_ref("tok2vec", tok2vec) parametric_attention.set_ref("attention_layer", attention_layer) + parametric_attention.set_ref("key_transform", key_transform) parametric_attention.set_ref("nonlinear_layer", nonlinear_layer) parametric_attention.set_ref("norm_layer", norm_layer) @@ -248,10 +266,17 @@ def _build_parametric_attention_with_residual_nonlinear( def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model: + # When tok2vec is lazily initialized, we need to initialize it before + # the rest of the chain to ensure that we can get its width. + tok2vec = model.get_ref("tok2vec") + tok2vec.initialize(X) + tok2vec_width = get_tok2vec_width(model) model.get_ref("attention_layer").set_dim("nO", tok2vec_width) - model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width) + model.get_ref("key_transform").set_dim("nI", tok2vec_width) + model.get_ref("key_transform").set_dim("nO", tok2vec_width) model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width) + model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width) model.get_ref("norm_layer").set_dim("nI", tok2vec_width) model.get_ref("norm_layer").set_dim("nO", tok2vec_width) init_chain(model, X, Y) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index ecad6484ef5..6c6b5daa8e7 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -24,7 +24,6 @@ from ..errors import Errors, Warnings from ..kb import Candidate, KnowledgeBase from ..language import Language -from ..ml import empty_kb from ..scorer import Scorer from ..tokens import Doc, Span, SpanGroup from ..training import Example, validate_examples, validate_get_examples @@ -114,7 +113,7 @@ def make_entity_linker( documents with textual mentions. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. - use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another + use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another component must provide entity annotations. threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold, prediction is discarded. If None, predictions are not filtered by any threshold. 
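Reviewer note on the lazy-initialization changes in spacy/ml/models/textcat.py above: they rest on the Thinc contract that a model may leave its output dim ("nO") unset until initialize() has seen data, so the tok2vec must be initialized before any layer sized from its width. A minimal standalone sketch of that contract (the make_lazy_encoder helper is illustrative only, not part of the patch):

from thinc.api import Model


def make_lazy_encoder(width: int) -> Model:
    # Toy stand-in for a tok2vec whose output width ("nO") is only
    # resolved when initialize() runs.
    def forward(model: Model, X, is_train: bool):
        nO = model.get_dim("nO")
        Y = [model.ops.alloc2f(len(seq), nO) for seq in X]
        return Y, lambda dY: []

    def init(model: Model, X=None, Y=None):
        model.set_dim("nO", model.attrs["width"])

    return Model(
        "lazy_encoder", forward, init=init, dims={"nO": None}, attrs={"width": width}
    )


encoder = make_lazy_encoder(width=96)
# encoder.get_dim("nO") would raise here: the width is not known yet.
encoder.initialize(X=[["a", "doc"], ["another"]])
assert encoder.get_dim("nO") == 96
# Only now can downstream layers (attention, maxout, norm) be sized, which
# is why the init functions above call tok2vec.initialize(X) first and set
# the dependent dims before init_chain() handles the rest.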
@@ -227,7 +226,6 @@ def __init__( self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.distance = CosineDistance(normalize=False) self.kb = generate_empty_kb(self.vocab, entity_vector_length) - self.scorer = scorer self.use_gold_ents = use_gold_ents self.threshold = threshold self.save_activations = save_activations @@ -235,6 +233,37 @@ def __init__( if self.incl_prior and not self.kb.supports_prior_probs: warnings.warn(Warnings.W401) + def _score_with_ents_set(examples: Iterable[Example], **kwargs): + # Because of how spaCy works, we can't just score immediately, because Language.evaluate + # calls pipe() on the predicted docs, which won't have entities if there is no NER in the pipeline. + if not scorer: + return scorer + if not self.use_gold_ents: + return scorer(examples, **kwargs) + else: + examples = self._ensure_ents(examples) + docs = self.pipe( + (eg.predicted for eg in examples), + ) + for eg, doc in zip(examples, docs): + eg.predicted = doc + return scorer(examples, **kwargs) + + self.scorer = _score_with_ents_set + + def _ensure_ents(self, examples: Iterable[Example]) -> Iterable[Example]: + """If use_gold_ents is true, set the entities of (a copy of) eg.predicted to the aligned gold entities.""" + if not self.use_gold_ents: + return examples + + new_examples = [] + for eg in examples: + ents, _ = eg.get_aligned_ents_and_ner() + new_eg = eg.copy() + new_eg.predicted.ents = ents + new_examples.append(new_eg) + return new_examples + def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will create it using this object's vocab.""" @@ -276,11 +305,9 @@ def initialize( nO = self.kb.entity_vector_length doc_sample = [] vector_sample = [] - for eg in islice(get_examples(), 10): + examples = self._ensure_ents(islice(get_examples(), 10)) + for eg in examples: doc = eg.x - if self.use_gold_ents: - ents, _ = eg.get_aligned_ents_and_ner() - doc.ents = ents doc_sample.append(doc) vector_sample.append(self.model.ops.alloc1f(nO)) assert len(doc_sample) > 0, Errors.E923.format(name=self.name) @@ -347,31 +374,17 @@ def update( losses.setdefault(self.name, 0.0) if not examples: return losses + examples = self._ensure_ents(examples) validate_examples(examples, "EntityLinker.update") - set_dropout_rate(self.model, drop) - docs = [eg.predicted for eg in examples] - # save to restore later - old_ents = [doc.ents for doc in docs] - - for doc, ex in zip(docs, examples): - if self.use_gold_ents: - ents, _ = ex.get_aligned_ents_and_ner() - doc.ents = ents - else: - # only keep matching ents - doc.ents = ex.get_matching_ents() - # make sure we have something to learn from, if not, short-circuit if not self.batch_has_learnable_example(examples): return losses + set_dropout_rate(self.model, drop) + docs = [eg.predicted for eg in examples] sentence_encodings, bp_context = self.model.begin_update(docs) - # now restore the ents - for doc, old in zip(docs, old_ents): - doc.ents = old - loss, d_scores = self.get_loss( sentence_encodings=sentence_encodings, examples=examples ) @@ -379,11 +392,13 @@ def update( if sgd is not None: self.finish_update(sgd) losses[self.name] += loss + return losses def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): validate_examples(examples, "EntityLinker.get_loss") entity_encodings = [] + # We assume that get_loss is called with gold ents set in the examples if need be eidx = 0 # indices in gold entities to keep keep_ents = [] # indices in sentence_encodings to keep diff --git
a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 493cbe1fa63..d7a39e4b450 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -799,7 +799,7 @@ def test_preserving_links_ents_2(nlp): # fmt: on -def test_overfitting_IO(): +def test_overfitting_IO_gold_entities(): # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly nlp = English() vector_length = 3 @@ -826,7 +826,9 @@ def create_kb(vocab): return mykb # Create the Entity Linker component and add it to the pipeline - entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker = nlp.add_pipe( + "entity_linker", last=True, config={"use_gold_ents": True} + ) assert isinstance(entity_linker, EntityLinker) entity_linker.set_kb(create_kb) assert "Q2146908" in entity_linker.vocab.strings @@ -889,6 +891,107 @@ def create_kb(vocab): assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps) + eval = nlp.evaluate(train_examples) + assert "nel_macro_p" in eval + assert "nel_macro_r" in eval + assert "nel_macro_f" in eval + assert "nel_micro_p" in eval + assert "nel_micro_r" in eval + assert "nel_micro_f" in eval + assert "nel_f_per_type" in eval + assert "PERSON" in eval["nel_f_per_type"] + + assert eval["nel_macro_f"] > 0 + assert eval["nel_micro_f"] > 0 + + +def test_overfitting_IO_with_ner(): + # Simple test to try and overfit the NER and NEL component in combination - ensuring the ML models work correctly + nlp = English() + vector_length = 3 + assert "Q2146908" not in nlp.vocab.strings + + # Convert the texts to docs to make sure we have doc.ents set for the training examples + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB - assign same prior weight to the two russ cochran's + # Q2146908 (Russ Cochran): American golfer + # Q7381115 (Russ Cochran): publisher + mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="Russ Cochran", + entities=["Q2146908", "Q7381115"], + probabilities=[0.5, 0.5], + ) + return mykb + + # Create the NER and EL components and add them to the pipeline + ner = nlp.add_pipe("ner", first=True) + entity_linker = nlp.add_pipe( + "entity_linker", last=True, config={"use_gold_ents": False} + ) + entity_linker.set_kb(create_kb) + + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + for ent in annotations.get("entities"): + ner.add_label(ent[2]) + optimizer = nlp.initialize() + + # train the NER and NEL pipes + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + assert losses["ner"] < 0.001 + assert losses["entity_linker"] < 0.001 + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + + # test the trained model + test_text = "Russ Cochran captured his first major title with his son as caddie." 
+ doc = nlp(test_text) + ents = doc.ents + assert len(ents) == 1 + assert ents[0].text == "Russ Cochran" + assert ents[0].label_ == "PERSON" + assert ents[0].kb_id_ != "NIL" + + # TODO: below assert is still flaky - EL doesn't properly overfit quite yet + # assert ents[0].kb_id_ == "Q2146908" + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + assert nlp2.pipe_names == nlp.pipe_names + doc2 = nlp2(test_text) + ents2 = doc2.ents + assert len(ents2) == 1 + assert ents2[0].text == "Russ Cochran" + assert ents2[0].label_ == "PERSON" + assert ents2[0].kb_id_ != "NIL" + + eval = nlp.evaluate(train_examples) + assert "nel_macro_f" in eval + assert "nel_micro_f" in eval + assert "ents_f" in eval + assert "nel_f_per_type" in eval + assert "ents_per_type" in eval + assert "PERSON" in eval["nel_f_per_type"] + assert "PERSON" in eval["ents_per_type"] + + assert eval["nel_macro_f"] > 0 + assert eval["nel_micro_f"] > 0 + assert eval["ents_f"] > 0 + def test_kb_serialization(): # Test that the KB can be used in a pipeline with a different vocab diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 35cec61f132..43c5257e591 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -29,6 +29,8 @@ from spacy.training import Example from spacy.training.initialize import init_nlp +# Ensure that the architecture gets added to the registry. +from ..tok2vec import build_lazy_init_tok2vec as _ from ..util import make_tempdir TRAIN_DATA_SINGLE_LABEL = [ @@ -41,6 +43,13 @@ ("I'm confused but happy", {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}}), ] +lazy_init_model_config = """ +[model] +@architectures = "test.LazyInitTok2Vec.v1" +width = 96 +""" +LAZY_INIT_TOK2VEC_MODEL = Config().from_str(lazy_init_model_config)["model"] + def make_get_examples_single_label(nlp): train_examples = [] @@ -551,6 +560,34 @@ def test_error_with_multi_labels(): nlp.initialize(get_examples=lambda: train_examples) +# fmt: off +@pytest.mark.parametrize( + "name,textcat_config", + [ + # ENSEMBLE V2 + ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), + ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), + # PARAMETRIC ATTENTION V1 + ("textcat", {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": True}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": False}), + # REDUCE + ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), + ], +) +# fmt: on +def test_tok2vec_lazy_init(name, textcat_config): + # Check that we can properly initialize and use a 
textcat model using + # a lazily-initialized tok2vec. + nlp = English() + pipe_config = {"model": textcat_config} + textcat = nlp.add_pipe(name, config=pipe_config) + textcat.add_label("POSITIVE") + textcat.add_label("NEGATIVE") + nlp.initialize() + nlp.pipe(["This is a test."]) + + @pytest.mark.parametrize( "name,get_examples, train_data", [ diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index ff53ed1e1b0..7b729d78f21 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -12,7 +12,7 @@ import spacy from spacy import about -from spacy.cli import info +from spacy.cli import download_module, info from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory from spacy.cli.apply import apply from spacy.cli.debug_data import ( @@ -1066,3 +1066,15 @@ def test_debug_data_trainable_lemmatizer_not_annotated(): def test_project_api_imports(): from spacy.cli import project_run from spacy.cli.project.run import project_run # noqa: F401, F811 + + +def test_download_rejects_relative_urls(monkeypatch): + """Test that we can't tell spacy download to get an arbitrary model by using a + relative path in the filename""" + + monkeypatch.setattr(download_module, "run_command", lambda cmd: None) + + # Check that normal download works + download_module.download("en_core_web_sm-3.7.1", direct=True) + with pytest.raises(SystemExit): + download_module.download("../en_core_web_sm-3.7.1", direct=True) diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index e044edf2da9..6ed0f44eab9 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -1,5 +1,6 @@ import itertools import logging +import warnings from unittest import mock import pytest @@ -423,7 +424,7 @@ def test_language_pipe_error_handler(n_process): nlp.set_error_handler(raise_error) with pytest.raises(ValueError): list(nlp.pipe(texts, n_process=n_process)) - # set explicitely to ignoring + # set explicitly to ignoring nlp.set_error_handler(ignore_error) docs = list(nlp.pipe(texts, n_process=n_process)) assert len(docs) == 0 @@ -834,9 +835,13 @@ def test_pass_doc_to_pipeline(nlp, n_process): assert doc.text == texts[0] assert len(doc.cats) > 0 if isinstance(get_current_ops(), NumpyOps) or n_process < 2: - docs = nlp.pipe(docs, n_process=n_process) - assert [doc.text for doc in docs] == texts - assert all(len(doc.cats) for doc in docs) + # Catch warnings to ensure that all worker processes exited + # successfully. + with warnings.catch_warnings(): + warnings.simplefilter("error") + docs = nlp.pipe(docs, n_process=n_process) + assert [doc.text for doc in docs] == texts + assert all(len(doc.cats) for doc in docs) def test_invalid_arg_to_pipeline(nlp): diff --git a/spacy/tests/tok2vec.py b/spacy/tests/tok2vec.py new file mode 100644 index 00000000000..7e7b689eb55 --- /dev/null +++ b/spacy/tests/tok2vec.py @@ -0,0 +1,36 @@ +from typing import List + +from thinc.api import Model +from thinc.types import Floats2d + +from spacy.tokens import Doc +from spacy.util import registry + + +@registry.architectures("test.LazyInitTok2Vec.v1") +def build_lazy_init_tok2vec(*, width: int) -> Model[List[Doc], List[Floats2d]]: + """tok2vec model whose output size is only known after
This implementation does not output meaningful + embeddings, it is strictly for testing.""" + return Model( + "lazy_init_tok2vec", + lazy_init_tok2vec_forward, + init=lazy_init_tok2vec_init, + dims={"nO": None}, + attrs={"width": width}, + ) + + +def lazy_init_tok2vec_init(model: Model, X=None, Y=None): + width = model.attrs["width"] + model.set_dim("nO", width) + + +def lazy_init_tok2vec_forward(model: Model, X: List[Doc], is_train: bool): + width = model.get_dim("nO") + Y = [model.ops.alloc2f(len(doc), width) for doc in X] + + def backprop(dY): + return [] + + return Y, backprop diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 189d8f3d570..d79c0ac09f3 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -71,65 +71,72 @@ cdef class Tokenizer: self._special_matcher = PhraseMatcher(self.vocab) self._load_special_cases(rules) - property token_match: - def __get__(self): - return self._token_match - - def __set__(self, token_match): - self._token_match = token_match - self._reload_special_cases() - - property url_match: - def __get__(self): - return self._url_match - - def __set__(self, url_match): - self._url_match = url_match - self._reload_special_cases() - - property prefix_search: - def __get__(self): - return self._prefix_search - - def __set__(self, prefix_search): - self._prefix_search = prefix_search - self._reload_special_cases() - - property suffix_search: - def __get__(self): - return self._suffix_search - - def __set__(self, suffix_search): - self._suffix_search = suffix_search - self._reload_special_cases() - - property infix_finditer: - def __get__(self): - return self._infix_finditer - - def __set__(self, infix_finditer): - self._infix_finditer = infix_finditer - self._reload_special_cases() - - property rules: - def __get__(self): - return self._rules - - def __set__(self, rules): - self._rules = {} - self._flush_cache() - self._flush_specials() - self._cache = PreshMap() - self._specials = PreshMap() - self._load_special_cases(rules) - - property faster_heuristics: - def __get__(self): - return self._faster_heuristics - - def __set__(self, faster_heuristics): - self._faster_heuristics = faster_heuristics - self._reload_special_cases() + @property + def token_match(self): + return self._token_match + + @token_match.setter + def token_match(self, token_match): + self._token_match = token_match + self._reload_special_cases() + + @property + def url_match(self): + return self._url_match + + @url_match.setter + def url_match(self, url_match): + self._url_match = url_match + self._reload_special_cases() + + @property + def prefix_search(self): + return self._prefix_search + + @prefix_search.setter + def prefix_search(self, prefix_search): + self._prefix_search = prefix_search + self._reload_special_cases() + + @property + def suffix_search(self): + return self._suffix_search + + @suffix_search.setter + def suffix_search(self, suffix_search): + self._suffix_search = suffix_search + self._reload_special_cases() + + @property + def infix_finditer(self): + return self._infix_finditer + + @infix_finditer.setter + def infix_finditer(self, infix_finditer): + self._infix_finditer = infix_finditer + self._reload_special_cases() + + @property + def rules(self): + return self._rules + + @rules.setter + def rules(self, rules): + self._rules = {} + self._flush_cache() + self._flush_specials() + self._cache = PreshMap() + self._specials = PreshMap() + self._load_special_cases(rules) + + @property + def faster_heuristics(self): + return self._faster_heuristics + + 
@faster_heuristics.setter + def faster_heuristics(self, faster_heuristics): + self._faster_heuristics = faster_heuristics + self._reload_special_cases() def __reduce__(self): args = (self.vocab, diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 14fbb71c3c2..1ccbc24abc5 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -667,7 +667,8 @@ cdef class Doc: else: return False - property vector: + @property + def vector(self): """A real-valued meaning representation. Defaults to an average of the token vectors. @@ -676,45 +677,46 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#vector """ - def __get__(self): - if "vector" in self.user_hooks: - return self.user_hooks["vector"](self) - if self._vector is not None: - return self._vector - xp = get_array_module(self.vocab.vectors.data) - if not len(self): - self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") - return self._vector - elif self.vocab.vectors.size > 0: - self._vector = sum(t.vector for t in self) / len(self) - return self._vector - else: - return xp.zeros((self.vocab.vectors_length,), dtype="float32") + if "vector" in self.user_hooks: + return self.user_hooks["vector"](self) + if self._vector is not None: + return self._vector + xp = get_array_module(self.vocab.vectors.data) + if not len(self): + self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") + return self._vector + elif self.vocab.vectors.size > 0: + self._vector = sum(t.vector for t in self) / len(self) + return self._vector + else: + return xp.zeros((self.vocab.vectors_length,), dtype="float32") - def __set__(self, value): - self._vector = value + @vector.setter + def vector(self, value): + self._vector = value - property vector_norm: + @property + def vector_norm(self): """The L2 norm of the document's vector representation. RETURNS (float): The L2 norm of the vector representation. DOCS: https://spacy.io/api/doc#vector_norm """ - def __get__(self): - if "vector_norm" in self.user_hooks: - return self.user_hooks["vector_norm"](self) - cdef float value - cdef double norm = 0 - if self._vector_norm is None: - norm = 0.0 - for value in self.vector: - norm += value * value - self._vector_norm = sqrt(norm) if norm != 0 else 0 - return self._vector_norm - - def __set__(self, value): - self._vector_norm = value + if "vector_norm" in self.user_hooks: + return self.user_hooks["vector_norm"](self) + cdef float value + cdef double norm = 0 + if self._vector_norm is None: + norm = 0.0 + for value in self.vector: + norm += value * value + self._vector_norm = sqrt(norm) if norm != 0 else 0 + return self._vector_norm + + @vector_norm.setter + def vector_norm(self, value): + self._vector_norm = value @property def text(self): @@ -733,7 +735,8 @@ cdef class Doc: """ return self.text - property ents: + @property + def ents(self): """The named entities in the document. Returns a list of named entity `Span` objects, if the entity recognizer has been applied. 
@@ -741,55 +744,55 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#ents """ - def __get__(self): - cdef int i - cdef const TokenC* token - cdef int start = -1 - cdef attr_t label = 0 - cdef attr_t kb_id = 0 - cdef attr_t ent_id = 0 - output = [] - for i in range(self.length): - token = &self.c[i] - if token.ent_iob == 1: - if start == -1: - seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]] - raise ValueError(Errors.E093.format(seq=" ".join(seq))) - elif token.ent_iob == 2 or token.ent_iob == 0 or \ - (token.ent_iob == 3 and token.ent_type == 0): - if start != -1: - output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) - start = -1 - label = 0 - kb_id = 0 - ent_id = 0 - elif token.ent_iob == 3: - if start != -1: - output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) - start = i - label = token.ent_type - kb_id = token.ent_kb_id - ent_id = token.ent_id - if start != -1: - output.append(Span(self, start, self.length, label=label, kb_id=kb_id, span_id=ent_id)) - # remove empty-label spans - output = [o for o in output if o.label_ != ""] - return tuple(output) - - def __set__(self, ents): - # TODO: - # 1. Test basic data-driven ORTH gazetteer - # 2. Test more nuanced date and currency regex - cdef attr_t kb_id, ent_id - cdef int ent_start, ent_end - ent_spans = [] - for ent_info in ents: - entity_type_, kb_id, ent_start, ent_end, ent_id = get_entity_info(ent_info) - if isinstance(entity_type_, str): - self.vocab.strings.add(entity_type_) - span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id, span_id=ent_id) - ent_spans.append(span) - self.set_ents(ent_spans, default=SetEntsDefault.outside) + cdef int i + cdef const TokenC* token + cdef int start = -1 + cdef attr_t label = 0 + cdef attr_t kb_id = 0 + cdef attr_t ent_id = 0 + output = [] + for i in range(self.length): + token = &self.c[i] + if token.ent_iob == 1: + if start == -1: + seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]] + raise ValueError(Errors.E093.format(seq=" ".join(seq))) + elif token.ent_iob == 2 or token.ent_iob == 0 or \ + (token.ent_iob == 3 and token.ent_type == 0): + if start != -1: + output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) + start = -1 + label = 0 + kb_id = 0 + ent_id = 0 + elif token.ent_iob == 3: + if start != -1: + output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) + start = i + label = token.ent_type + kb_id = token.ent_kb_id + ent_id = token.ent_id + if start != -1: + output.append(Span(self, start, self.length, label=label, kb_id=kb_id, span_id=ent_id)) + # remove empty-label spans + output = [o for o in output if o.label_ != ""] + return tuple(output) + + @ents.setter + def ents(self, ents): + # TODO: + # 1. Test basic data-driven ORTH gazetteer + # 2. Test more nuanced date and currency regex + cdef attr_t kb_id, ent_id + cdef int ent_start, ent_end + ent_spans = [] + for ent_info in ents: + entity_type_, kb_id, ent_start, ent_end, ent_id = get_entity_info(ent_info) + if isinstance(entity_type_, str): + self.vocab.strings.add(entity_type_) + span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id, span_id=ent_id) + ent_spans.append(span) + self.set_ents(ent_spans, default=SetEntsDefault.outside) def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside): """Set entity annotation. 
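Reviewer note: the remaining files in this diff (span.pyx, token.pyx, as well as lexeme.pyx, tokenizer.pyx, and doc.pyx above) all apply the same mechanical conversion, from Cython's legacy property blocks with nested __get__/__set__ to @property/@name.setter pairs. A minimal pure-Python sketch of the before/after shape, with hypothetical Thing/_value names used only for illustration:

class Thing:
    def __init__(self):
        self._value = 0

    # Old Cython spelling (not valid in plain Python):
    # property value:
    #     def __get__(self):
    #         return self._value
    #     def __set__(self, value):
    #         self._value = value

    # New spelling, same behaviour:
    @property
    def value(self):
        """RETURNS (int): The stored value."""
        return self._value

    @value.setter
    def value(self, value):
        self._value = value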
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 203e5db52cc..11937231796 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -786,110 +786,130 @@ cdef class Span: for word in self.rights: yield from word.subtree - property start: - def __get__(self): - return self.span_c().start - - def __set__(self, int start): - if start < 0 or start > self.doc.length: - raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start)) - cdef SpanC* span_c = self.span_c() - if start > span_c.end: - raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end)) - span_c.start = start - span_c.start_char = self.doc.c[start].idx - - property end: - def __get__(self): - return self.span_c().end - - def __set__(self, int end): - if end < 0 or end > self.doc.length: - raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end)) - cdef SpanC* span_c = self.span_c() - if span_c.start > end: - raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start)) - span_c.end = end - if end > 0: - span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length - else: - span_c.end_char = 0 + @property + def start(self): + return self.span_c().start + + @start.setter + def start(self, int start): + if start < 0 or start > self.doc.length: + raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start)) + cdef SpanC * span_c = self.span_c() + if start > span_c.end: + raise ValueError( + Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end)) + span_c.start = start + span_c.start_char = self.doc.c[start].idx - property start_char: - def __get__(self): - return self.span_c().start_char - - def __set__(self, int start_char): - if start_char < 0 or start_char > len(self.doc.text): - raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char)) - cdef int start = token_by_start(self.doc.c, self.doc.length, start_char) - if start < 0: - raise ValueError(Errors.E4008.format(value=start_char, pos="start")) - cdef SpanC* span_c = self.span_c() - if start_char > span_c.end_char: - raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char)) - span_c.start_char = start_char - span_c.start = start - - property end_char: - def __get__(self): - return self.span_c().end_char - - def __set__(self, int end_char): - if end_char < 0 or end_char > len(self.doc.text): - raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char)) - cdef int end = token_by_end(self.doc.c, self.doc.length, end_char) - if end < 0: - raise ValueError(Errors.E4008.format(value=end_char, pos="end")) - cdef SpanC* span_c = self.span_c() - if span_c.start_char > end_char: - raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char)) - span_c.end_char = end_char - span_c.end = end - - property label: - def __get__(self): - return self.span_c().label + @property + def end(self): + return self.span_c().end + + @end.setter + def end(self, int end): + if end < 0 or end > self.doc.length: + raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end)) + cdef SpanC * span_c = 
self.span_c() + if span_c.start > end: + raise ValueError( + Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start)) + span_c.end = end + if end > 0: + span_c.end_char = self.doc.c[end - 1].idx + self.doc.c[end - 1].lex.length + else: + span_c.end_char = 0 - def __set__(self, attr_t label): - if label != self.span_c().label : - old_label = self.span_c().label - self.span_c().label = label - new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id) - old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=old_label, kb_id=self.kb_id, span_id=self.id) - Underscore._replace_keys(old, new) + @property + def start_char(self): + return self.span_c().start_char + + @start_char.setter + def start_char(self, int start_char): + if start_char < 0 or start_char > len(self.doc.text): + raise IndexError( + Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char)) + cdef int start = token_by_start(self.doc.c, self.doc.length, start_char) + if start < 0: + raise ValueError(Errors.E4008.format(value=start_char, pos="start")) + cdef SpanC * span_c = self.span_c() + if start_char > span_c.end_char: + raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", + existing_value=span_c.end_char)) + span_c.start_char = start_char + span_c.start = start - property kb_id: - def __get__(self): - return self.span_c().kb_id + @property + def end_char(self): + return self.span_c().end_char + + @end_char.setter + def end_char(self, int end_char): + if end_char < 0 or end_char > len(self.doc.text): + raise IndexError( + Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char)) + cdef int end = token_by_end(self.doc.c, self.doc.length, end_char) + if end < 0: + raise ValueError(Errors.E4008.format(value=end_char, pos="end")) + cdef SpanC * span_c = self.span_c() + if span_c.start_char > end_char: + raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", + existing_value=span_c.start_char)) + span_c.end_char = end_char + span_c.end = end - def __set__(self, attr_t kb_id): - if kb_id != self.span_c().kb_id : - old_kb_id = self.span_c().kb_id - self.span_c().kb_id = kb_id - new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id) - old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=old_kb_id, span_id=self.id) - Underscore._replace_keys(old, new) + @property + def label(self): + return self.span_c().label + + @label.setter + def label(self, attr_t label): + if label != self.span_c().label: + old_label = self.span_c().label + self.span_c().label = label + new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, + end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id) + old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, + end=self.span_c().end_char, label=old_label, kb_id=self.kb_id, span_id=self.id) + Underscore._replace_keys(old, new) - property id: - def __get__(self): - return self.span_c().id + @property + def kb_id(self): + return self.span_c().kb_id + + @kb_id.setter 
+ def kb_id(self, attr_t kb_id): + if kb_id != self.span_c().kb_id: + old_kb_id = self.span_c().kb_id + self.span_c().kb_id = kb_id + new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, + end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id) + old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, + end=self.span_c().end_char, label=self.label, kb_id=old_kb_id, span_id=self.id) + Underscore._replace_keys(old, new) - def __set__(self, attr_t id): - if id != self.span_c().id : - old_id = self.span_c().id - self.span_c().id = id - new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id) - old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=old_id) - Underscore._replace_keys(old, new) + @property + def id(self): + return self.span_c().id + + @id.setter + def id(self, attr_t id): + if id != self.span_c().id: + old_id = self.span_c().id + self.span_c().id = id + new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, + end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id) + old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, + end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=old_id) + Underscore._replace_keys(old, new) - property ent_id: + @property + def ent_id(self): """Alias for the span's ID.""" - def __get__(self): - return self.id + return self.id - def __set__(self, attr_t ent_id): - self.id = ent_id + @ent_id.setter + def ent_id(self, attr_t ent_id): + self.id = ent_id @property def orth_(self): @@ -904,29 +924,32 @@ cdef class Span: """RETURNS (str): The span's lemma.""" return "".join([t.lemma_ + t.whitespace_ for t in self]).strip() - property label_: + @property + def label_(self): """The span's label.""" - def __get__(self): - return self.doc.vocab.strings[self.label] + return self.doc.vocab.strings[self.label] - def __set__(self, str label_): - self.label = self.doc.vocab.strings.add(label_) + @label_.setter + def label_(self, str label_): + self.label = self.doc.vocab.strings.add(label_) - property kb_id_: + @property + def kb_id_(self): """The span's KB ID.""" - def __get__(self): - return self.doc.vocab.strings[self.kb_id] + return self.doc.vocab.strings[self.kb_id] - def __set__(self, str kb_id_): - self.kb_id = self.doc.vocab.strings.add(kb_id_) + @kb_id_.setter + def kb_id_(self, str kb_id_): + self.kb_id = self.doc.vocab.strings.add(kb_id_) - property id_: + @property + def id_(self): """The span's ID.""" - def __get__(self): - return self.doc.vocab.strings[self.id] + return self.doc.vocab.strings[self.id] - def __set__(self, str id_): - self.id = self.doc.vocab.strings.add(id_) + @id_.setter + def id_(self, str id_): + self.id = self.doc.vocab.strings.add(id_) property ent_id_: """Alias for the span's ID.""" diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 358209efb53..5fef5c7e805 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -250,15 +250,16 @@ cdef class Token: """ return not self.c.morph == 0 - property morph: - def __get__(self): - return MorphAnalysis.from_id(self.vocab, self.c.morph) + @property + def morph(self): + return MorphAnalysis.from_id(self.vocab, self.c.morph) - def __set__(self, MorphAnalysis morph): - # Check that the 
morph has the same vocab - if self.vocab != morph.vocab: - raise ValueError(Errors.E1013) - self.c.morph = deref(morph.c).key + @morph.setter + def morph(self, MorphAnalysis morph): + # Check that the morph has the same vocab + if self.vocab != morph.vocab: + raise ValueError(Errors.E1013) + self.c.morph = deref(morph.c).key def set_morph(self, features): cdef hash_t key @@ -370,39 +371,43 @@ cdef class Token: """ return self.c.lex.suffix - property lemma: + @property + def lemma(self): """RETURNS (uint64): ID of the base form of the word, with no inflectional suffixes. """ - def __get__(self): - return self.c.lemma + return self.c.lemma - def __set__(self, attr_t lemma): - self.c.lemma = lemma + @lemma.setter + def lemma(self, attr_t lemma): + self.c.lemma = lemma - property pos: + @property + def pos(self): """RETURNS (uint64): ID of coarse-grained part-of-speech tag.""" - def __get__(self): - return self.c.pos + return self.c.pos - def __set__(self, pos): - self.c.pos = pos + @pos.setter + def pos(self, pos): + self.c.pos = pos - property tag: + @property + def tag(self): """RETURNS (uint64): ID of fine-grained part-of-speech tag.""" - def __get__(self): - return self.c.tag + return self.c.tag - def __set__(self, attr_t tag): - self.c.tag = tag + @tag.setter + def tag(self, attr_t tag): + self.c.tag = tag - property dep: + @property + def dep(self): """RETURNS (uint64): ID of syntactic dependency label.""" - def __get__(self): - return self.c.dep + return self.c.dep - def __set__(self, attr_t label): - self.c.dep = label + @dep.setter + def dep(self, attr_t label): + self.c.dep = label @property def has_vector(self): @@ -483,48 +488,51 @@ cdef class Token: return self.doc.user_token_hooks["sent"](self) return self.doc[self.i : self.i+1].sent - property sent_start: - def __get__(self): - """Deprecated: use Token.is_sent_start instead.""" - # Raising a deprecation warning here causes errors for autocomplete - # Handle broken backwards compatibility case: doc[0].sent_start - # was False. - if self.i == 0: - return False - else: - return self.c.sent_start + @property + def sent_start(self): + """Deprecated: use Token.is_sent_start instead.""" + # Raising a deprecation warning here causes errors for autocomplete + # Handle broken backwards compatibility case: doc[0].sent_start + # was False. + if self.i == 0: + return False + else: + return self.c.sent_start - def __set__(self, value): - self.is_sent_start = value + @sent_start.setter + def sent_start(self, value): + self.is_sent_start = value - property is_sent_start: + @property + def is_sent_start(self): """A boolean value indicating whether the token starts a sentence. `None` if unknown. Defaults to `True` for the first token in the `Doc`. RETURNS (bool / None): Whether the token starts a sentence. None if unknown. 
""" - def __get__(self): - if self.c.sent_start == 0: - return None - elif self.c.sent_start < 0: - return False - else: - return True - - def __set__(self, value): - if self.doc.has_annotation("DEP"): - raise ValueError(Errors.E043) - if value is None: - self.c.sent_start = 0 - elif value is True: - self.c.sent_start = 1 - elif value is False: - self.c.sent_start = -1 - else: - raise ValueError(Errors.E044.format(value=value)) + if self.c.sent_start == 0: + return None + elif self.c.sent_start < 0: + return False + else: + return True + + @is_sent_start.setter + def is_sent_start(self, value): + if self.doc.has_annotation("DEP"): + raise ValueError(Errors.E043) + if value is None: + self.c.sent_start = 0 + elif value is True: + self.c.sent_start = 1 + elif value is False: + self.c.sent_start = -1 + else: + raise ValueError(Errors.E044.format(value=value)) - property is_sent_end: + @property + def is_sent_end(self): """A boolean value indicating whether the token ends a sentence. `None` if unknown. Defaults to `True` for the last token in the `Doc`. @@ -533,18 +541,18 @@ cdef class Token: DOCS: https://spacy.io/api/token#is_sent_end """ - def __get__(self): - if self.i + 1 == len(self.doc): - return True - elif self.doc[self.i+1].is_sent_start is None: - return None - elif self.doc[self.i+1].is_sent_start is True: - return True - else: - return False + if self.i + 1 == len(self.doc): + return True + elif self.doc[self.i+1].is_sent_start is None: + return None + elif self.doc[self.i+1].is_sent_start is True: + return True + else: + return False - def __set__(self, value): - raise ValueError(Errors.E196) + @is_sent_end.setter + def is_sent_end(self, value): + raise ValueError(Errors.E196) @property def lefts(self): @@ -671,41 +679,42 @@ cdef class Token: """ return not Token.missing_head(self.c) - property head: + @property + def head(self): """The syntactic parent, or "governor", of this token. If token.has_head() is `False`, this method will return itself. RETURNS (Token): The token predicted by the parser to be the head of the current token. 
""" - def __get__(self): - if not self.has_head(): - return self - else: - return self.doc[self.i + self.c.head] - - def __set__(self, Token new_head): - # This function sets the head of self to new_head and updates the - # counters for left/right dependents and left/right corner for the - # new and the old head - # Check that token is from the same document - if self.doc != new_head.doc: - raise ValueError(Errors.E191) - # Do nothing if old head is new head - if self.i + self.c.head == new_head.i: - return - # Find the widest l/r_edges of the roots of the two tokens involved - # to limit the number of tokens for set_children_from_heads - cdef Token self_root, new_head_root - self_root = ([self] + list(self.ancestors))[-1] - new_head_ancestors = list(new_head.ancestors) - new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head - start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge - end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge - # Set new head - self.c.head = new_head.i - self.i - # Adjust parse properties and sentence starts - set_children_from_heads(self.doc.c, start, end + 1) + if not self.has_head(): + return self + else: + return self.doc[self.i + self.c.head] + + @head.setter + def head(self, Token new_head): + # This function sets the head of self to new_head and updates the + # counters for left/right dependents and left/right corner for the + # new and the old head + # Check that token is from the same document + if self.doc != new_head.doc: + raise ValueError(Errors.E191) + # Do nothing if old head is new head + if self.i + self.c.head == new_head.i: + return + # Find the widest l/r_edges of the roots of the two tokens involved + # to limit the number of tokens for set_children_from_heads + cdef Token self_root, new_head_root + self_root = ([self] + list(self.ancestors))[-1] + new_head_ancestors = list(new_head.ancestors) + new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head + start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge + end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge + # Set new head + self.c.head = new_head.i - self.i + # Adjust parse properties and sentence starts + set_children_from_heads(self.doc.c, start, end + 1) @property def conjuncts(self): @@ -733,21 +742,23 @@ cdef class Token: queue.append(child) return tuple([w for w in output if w.i != self.i]) - property ent_type: + @property + def ent_type(self): """RETURNS (uint64): Named entity type.""" - def __get__(self): - return self.c.ent_type + return self.c.ent_type - def __set__(self, ent_type): - self.c.ent_type = ent_type + @ent_type.setter + def ent_type(self, ent_type): + self.c.ent_type = ent_type - property ent_type_: + @property + def ent_type_(self): """RETURNS (str): Named entity type.""" - def __get__(self): - return self.vocab.strings[self.c.ent_type] + return self.vocab.strings[self.c.ent_type] - def __set__(self, ent_type): - self.c.ent_type = self.vocab.strings.add(ent_type) + @ent_type_.setter + def ent_type_(self, ent_type): + self.c.ent_type = self.vocab.strings.add(ent_type) @property def ent_iob(self): @@ -773,41 +784,45 @@ cdef class Token: """ return self.iob_strings()[self.c.ent_iob] - property ent_id: + @property + def ent_id(self): """RETURNS (uint64): ID of the entity the token is an instance of, if any. 
""" - def __get__(self): - return self.c.ent_id + return self.c.ent_id - def __set__(self, hash_t key): - self.c.ent_id = key + @ent_id.setter + def ent_id(self, hash_t key): + self.c.ent_id = key - property ent_id_: + @property + def ent_id_(self): """RETURNS (str): ID of the entity the token is an instance of, if any. """ - def __get__(self): - return self.vocab.strings[self.c.ent_id] + return self.vocab.strings[self.c.ent_id] - def __set__(self, name): - self.c.ent_id = self.vocab.strings.add(name) + @ent_id_.setter + def ent_id_(self, name): + self.c.ent_id = self.vocab.strings.add(name) - property ent_kb_id: + @property + def ent_kb_id(self): """RETURNS (uint64): Named entity KB ID.""" - def __get__(self): - return self.c.ent_kb_id + return self.c.ent_kb_id - def __set__(self, attr_t ent_kb_id): - self.c.ent_kb_id = ent_kb_id + @ent_kb_id.setter + def ent_kb_id(self, attr_t ent_kb_id): + self.c.ent_kb_id = ent_kb_id - property ent_kb_id_: + @property + def ent_kb_id_(self): """RETURNS (str): Named entity KB ID.""" - def __get__(self): - return self.vocab.strings[self.c.ent_kb_id] + return self.vocab.strings[self.c.ent_kb_id] - def __set__(self, ent_kb_id): - self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id) + @ent_kb_id_.setter + def ent_kb_id_(self, ent_kb_id): + self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id) @property def whitespace_(self): @@ -829,16 +844,17 @@ cdef class Token: """ return self.vocab.strings[self.c.lex.lower] - property norm_: + @property + def norm_(self): """RETURNS (str): The token's norm, i.e. a normalised form of the token text. Usually set in the language's tokenizer exceptions or norm exceptions. """ - def __get__(self): - return self.vocab.strings[self.norm] + return self.vocab.strings[self.norm] - def __set__(self, str norm_): - self.c.norm = self.vocab.strings.add(norm_) + @norm_.setter + def norm_(self, str norm_): + self.c.norm = self.vocab.strings.add(norm_) @property def shape_(self): @@ -868,33 +884,36 @@ cdef class Token: """ return self.vocab.strings[self.c.lex.lang] - property lemma_: + @property + def lemma_(self): """RETURNS (str): The token lemma, i.e. the base form of the word, with no inflectional suffixes. """ - def __get__(self): - return self.vocab.strings[self.c.lemma] + return self.vocab.strings[self.c.lemma] - def __set__(self, str lemma_): - self.c.lemma = self.vocab.strings.add(lemma_) + @lemma_.setter + def lemma_(self, str lemma_): + self.c.lemma = self.vocab.strings.add(lemma_) - property pos_: + @property + def pos_(self): """RETURNS (str): Coarse-grained part-of-speech tag.""" - def __get__(self): - return parts_of_speech.NAMES[self.c.pos] + return parts_of_speech.NAMES[self.c.pos] - def __set__(self, pos_name): - if pos_name not in parts_of_speech.IDS: - raise ValueError(Errors.E1021.format(pp=pos_name)) - self.c.pos = parts_of_speech.IDS[pos_name] + @pos_.setter + def pos_(self, pos_name): + if pos_name not in parts_of_speech.IDS: + raise ValueError(Errors.E1021.format(pp=pos_name)) + self.c.pos = parts_of_speech.IDS[pos_name] - property tag_: + @property + def tag_(self): """RETURNS (str): Fine-grained part-of-speech tag.""" - def __get__(self): - return self.vocab.strings[self.c.tag] + return self.vocab.strings[self.c.tag] - def __set__(self, tag): - self.tag = self.vocab.strings.add(tag) + @tag_.setter + def tag_(self, tag): + self.tag = self.vocab.strings.add(tag) def has_dep(self): """Check whether the token has annotated dep information. 
@@ -904,13 +923,14 @@ cdef class Token: """ return not Token.missing_dep(self.c) - property dep_: + @property + def dep_(self): """RETURNS (str): The syntactic dependency label.""" - def __get__(self): - return self.vocab.strings[self.c.dep] + return self.vocab.strings[self.c.dep] - def __set__(self, str label): - self.c.dep = self.vocab.strings.add(label) + @dep_.setter + def dep_(self, str label): + self.c.dep = self.vocab.strings.add(label) @property def is_oov(self): diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 9cc9e04ab78..f70e738a7fa 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -101,23 +101,25 @@ cdef class Example: def __len__(self): return len(self.predicted) - property predicted: - def __get__(self): - return self.x + @property + def predicted(self): + return self.x - def __set__(self, doc): - self.x = doc - self._cached_alignment = None - self._cached_words_x = [t.text for t in doc] + @predicted.setter + def predicted(self, doc): + self.x = doc + self._cached_alignment = None + self._cached_words_x = [t.text for t in doc] - property reference: - def __get__(self): - return self.y + @property + def reference(self): + return self.y - def __set__(self, doc): - self.y = doc - self._cached_alignment = None - self._cached_words_y = [t.text for t in doc] + @reference.setter + def reference(self, doc): + self.y = doc + self._cached_alignment = None + self._cached_words_y = [t.text for t in doc] def copy(self): return Example( @@ -433,9 +435,9 @@ cdef class Example: seen_indices.update(indices) return output - property text: - def __get__(self): - return self.x.text + @property + def text(self): + return self.x.text def __str__(self): return str(self.to_dict()) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 6cd6da56cbf..7d5608e5284 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -87,16 +87,17 @@ cdef class Vocab: self.writing_system = writing_system self.get_noun_chunks = get_noun_chunks - property vectors: - def __get__(self): - return self._vectors + @property + def vectors(self): + return self._vectors - def __set__(self, vectors): - if hasattr(vectors, "strings"): - for s in vectors.strings: - self.strings.add(s) - self._vectors = vectors - self._vectors.strings = self.strings + @vectors.setter + def vectors(self, vectors): + if hasattr(vectors, "strings"): + for s in vectors.strings: + self.strings.add(s) + self._vectors = vectors + self._vectors.strings = self.strings @property def lang(self): @@ -450,17 +451,18 @@ cdef class Vocab: key = Lexeme.get_struct_attr(lex.c, self.vectors.attr) return key in self.vectors - property lookups: - def __get__(self): - return self._lookups - - def __set__(self, lookups): - self._lookups = lookups - if lookups.has_table("lexeme_norm"): - self.lex_attr_getters[NORM] = util.add_lookups( - self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), - self.lookups.get_table("lexeme_norm"), - ) + @property + def lookups(self): + return self._lookups + + @lookups.setter + def lookups(self, lookups): + self._lookups = lookups + if lookups.has_table("lexeme_norm"): + self.lex_attr_getters[NORM] = util.add_lookups( + self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), + self.lookups.get_table("lexeme_norm"), + ) def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. 
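The `span.pyx` and `token.pyx` hunks above are a mechanical migration from Cython's legacy `property name:` blocks with `__get__`/`__set__` to standard `@property`/`@name.setter` decorators; user-facing behavior is meant to be unchanged. A minimal sketch of that contract, assuming a build of this branch (the example text and values are hypothetical):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("New York City is big")

# Token attributes read and write through the same code paths as before;
# only the Cython declaration style changed.
token = doc[3]                 # "is"
token.lemma_ = "be"            # goes through the new @lemma_.setter
assert token.lemma_ == "be"
token.is_sent_start = True     # still raises E043 if the doc has DEP annotation

# Span string attributes still remap extension attributes via
# Underscore._replace_keys when the underlying hash changes.
span = doc[0:3]                # "New York City"
span.label_ = "GPE"            # interned via the new @label_.setter
assert span.label_ == "GPE"
```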
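The new `Span.start`/`end`/`start_char`/`end_char` setters in `span.pyx` also add validation that plain attribute assignment never had: offsets must stay within the `Doc` (`E1032`), keep `start <= end` (`E4007`), and land on token boundaries (`E4008`). A hedged sketch of the failure modes, assuming the error codes behave as the formatting calls above suggest (the new `Errors` entries themselves are not part of this diff):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("New York City is big")
span = doc[1:3]                    # "York City"

span.end_char = 8                  # char 8 ends the token "York": accepted

try:
    span.start_char = 2            # falls inside "New", no aligned token -> E4008
except ValueError as err:
    print("rejected:", err)

try:
    span.end_char = 99             # beyond the end of the Doc text -> E1032
except IndexError as err:
    print("rejected:", err)

try:
    span.end = 0                   # end would precede the span's start -> E4007
except ValueError as err:
    print("rejected:", err)
```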
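`Example` and `Vocab` get the same treatment: `predicted`, `reference`, `text`, `vectors` and `lookups` become decorator properties, with the setter side effects (alignment-cache invalidation, string syncing, `NORM` lookups wiring) carried over verbatim. A small illustration of the `Example.predicted` contract, with hypothetical texts:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
example = Example(nlp.make_doc("I like London"), nlp.make_doc("I like London"))

# Reassignment goes through the new @predicted.setter, which clears the
# cached alignment so it is recomputed against the new tokenization.
example.predicted = nlp.make_doc("I like London and Berlin")
assert example.text == "I like London and Berlin"   # `text` reads from the predicted doc
```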
diff --git a/website/docs/api/attributes.mdx b/website/docs/api/attributes.mdx index 3142b741d9a..9cb76ac5842 100644 --- a/website/docs/api/attributes.mdx +++ b/website/docs/api/attributes.mdx @@ -45,33 +45,33 @@ For attributes that represent string values, the internal integer ID is accessed as `Token.attr`, e.g. `token.dep`, while the string value can be retrieved by appending `_` as in `token.dep_`. -| Attribute | Description | -| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `DEP` | The token's dependency label. ~~str~~ | -| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | -| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer vaues rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ | -| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ | -| `ENT_TYPE` | The token's entity label. ~~str~~ | -| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ | -| `IS_ASCII` | Token text consists of ASCII characters. ~~bool~~ | -| `IS_DIGIT` | Token text consists of digits. ~~bool~~ | -| `IS_LOWER` | Token text is in lowercase. ~~bool~~ | -| `IS_PUNCT` | Token is punctuation. ~~bool~~ | -| `IS_SPACE` | Token is whitespace. ~~bool~~ | -| `IS_STOP` | Token is a stop word. ~~bool~~ | -| `IS_TITLE` | Token text is in titlecase. ~~bool~~ | -| `IS_UPPER` | Token text is in uppercase. ~~bool~~ | -| `LEMMA` | The token's lemma. ~~str~~ | -| `LENGTH` | The length of the token text. ~~int~~ | -| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ | -| `LIKE_NUM` | Token text resembles a number. ~~bool~~ | -| `LIKE_URL` | Token text resembles a URL. ~~bool~~ | -| `LOWER` | The lowercase form of the token text. ~~str~~ | -| `MORPH` | The token's morphological analysis. ~~MorphAnalysis~~ | -| `NORM` | The normalized form of the token text. ~~str~~ | -| `ORTH` | The exact verbatim text of a token. ~~str~~ | -| `POS` | The token's universal part of speech (UPOS). ~~str~~ | -| `SENT_START` | Token is start of sentence. ~~bool~~ | -| `SHAPE` | The token's shape. ~~str~~ | -| `SPACY` | Token has a trailing space. ~~bool~~ | -| `TAG` | The token's fine-grained part of speech. ~~str~~ | +| Attribute | Description | +| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `DEP` | The token's dependency label. ~~str~~ | +| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | +| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer values rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ | +| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ | +| `ENT_TYPE` | The token's entity label. ~~str~~ | +| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ | +| `IS_ASCII` | Token text consists of ASCII characters. ~~bool~~ | +| `IS_DIGIT` | Token text consists of digits. ~~bool~~ | +| `IS_LOWER` | Token text is in lowercase. ~~bool~~ | +| `IS_PUNCT` | Token is punctuation. ~~bool~~ | +| `IS_SPACE` | Token is whitespace. ~~bool~~ | +| `IS_STOP` | Token is a stop word. ~~bool~~ | +| `IS_TITLE` | Token text is in titlecase. ~~bool~~ | +| `IS_UPPER` | Token text is in uppercase. ~~bool~~ | +| `LEMMA` | The token's lemma. ~~str~~ | +| `LENGTH` | The length of the token text. 
~~int~~ | +| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ | +| `LIKE_NUM` | Token text resembles a number. ~~bool~~ | +| `LIKE_URL` | Token text resembles a URL. ~~bool~~ | +| `LOWER` | The lowercase form of the token text. ~~str~~ | +| `MORPH` | The token's morphological analysis. ~~MorphAnalysis~~ | +| `NORM` | The normalized form of the token text. ~~str~~ | +| `ORTH` | The exact verbatim text of a token. ~~str~~ | +| `POS` | The token's universal part of speech (UPOS). ~~str~~ | +| `SENT_START` | Token is start of sentence. ~~bool~~ | +| `SHAPE` | The token's shape. ~~str~~ | +| `SPACY` | Token has a trailing space. ~~bool~~ | +| `TAG` | The token's fine-grained part of speech. ~~str~~ | diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index a4daaadb350..6a8ccd0c9c1 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -566,7 +566,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL' 'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC' (1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338) ✔ Good amount of examples for all labels -✔ Examples without occurences available for all labels +✔ Examples without occurrences available for all labels ✔ No entities consisting of or starting/ending with whitespace =========================== Part-of-speech Tagging =========================== @@ -1322,7 +1322,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] ## find-threshold {id="find-threshold",version="3.5",tag="command"} -Runs prediction trials for a trained model with varying tresholds to maximize +Runs prediction trials for a trained model with varying thresholds to maximize the specified metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index 08e2151c03f..30eff16704a 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -61,13 +61,13 @@ architectures and their arguments and hyperparameters. | `incl_context` | Whether the local context is included in the model. Defaults to `True`. ~~bool~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [`EntityLinker`](/api/architectures#EntityLinker). ~~Model~~ | | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | -| `use_gold_ents` | Whether entities are copied from the gold docs. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | +| `use_gold_ents` | Whether entities are copied from the gold docs. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ | | `get_candidates` 4.0 | Function that retrieves plausible candidates per entity mention in a given `Iterator[SpanGroup]` (one `SpanGroup` includes all mentions found in a given `Doc` instance). Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator). ~~Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]~~ | | `generate_empty_kb` 3.6 | Function that generates an empty `KnowledgeBase` object. 
Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ | | `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | @@ -114,21 +114,21 @@ custom knowledge base, you should either call [`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the [`initialize`](/api/entitylinker#initialize) call. -| Name | Description | -| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | -| `get_candidates` | Function that retrieves plausible candidates per entity mention in a given `SpanGroup`. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator). ~~Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]~~ | -| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. 
~~Optional[float]~~ | +| Name | Description | +| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | +| `get_candidates` | Function that retrieves plausible candidates per entity mention in a given `SpanGroup`. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator). ~~Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]~~ | +| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ## EntityLinker.\_\_call\_\_ {id="call",tag="method"} diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx index adb1f14d40e..d347442fe91 100644 --- a/website/docs/api/entityruler.mdx +++ b/website/docs/api/entityruler.mdx @@ -69,7 +69,7 @@ how the component should be configured. You can override its settings via the | Setting | Description | | ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | -| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | +| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | | `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. 
~~str~~ | diff --git a/website/docs/api/morphology.mdx index 018ce25245e..7f6802034d2 100644 --- a/website/docs/api/morphology.mdx +++ b/website/docs/api/morphology.mdx @@ -147,9 +147,10 @@ Whether a feature/value pair is in the analysis. > assert "Feat1=Val1" in morph > ``` -| Name | Description | -| ----------- | --------------------------------------------- | -| **RETURNS** | A feature/value pair in the analysis. ~~str~~ | +| Name | Description | +| ------------ | --------------------------------------------------------------------- | +| `feature` | A feature/value pair. ~~str~~ | +| **RETURNS** | Whether the feature/value pair is contained in the analysis. ~~bool~~ | ### MorphAnalysis.\_\_iter\_\_ {id="morphanalysis-iter",tag="method"} diff --git a/website/docs/api/span.mdx index e1ada3b45ff..7342543d40a 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -287,7 +287,7 @@ does not permit other NPs to be nested within it – so no NP-level coordination no prepositional phrases, and no relative clauses. If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data) -has not been implemeted for the given language, a `NotImplementedError` is +has not been implemented for the given language, a `NotImplementedError` is raised. > #### Example diff --git a/website/docs/api/transformer.mdx index 8f024553dac..9dcafb55782 100644 --- a/website/docs/api/transformer.mdx +++ b/website/docs/api/transformer.mdx @@ -416,7 +416,7 @@ by this class. Instances of this class are typically assigned to the | `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | | `width` | The width of the last hidden layer. ~~int~~ | -### TransformerData.empty {id="transformerdata-emoty",tag="classmethod"} +### TransformerData.empty {id="transformerdata-empty",tag="classmethod"} Create an empty `TransformerData` container. diff --git a/website/docs/usage/layers-architectures.mdx index 03b85f5af91..344c66e8db2 100644 --- a/website/docs/usage/layers-architectures.mdx +++ b/website/docs/usage/layers-architectures.mdx @@ -832,7 +832,7 @@ retrieve and add to them. After creation, the component needs to be [initialized](/usage/training#initialization). This method can define the -relevant labels in two ways: explicitely by setting the `labels` argument in the +relevant labels in two ways: explicitly by setting the `labels` argument in the [`initialize` block](/api/data-formats#config-initialize) of the config, or -implicately by deducing them from the `get_examples` callback that generates +implicitly by deducing them from the `get_examples` callback that generates the full **training data set**, or a representative sample. diff --git a/website/docs/usage/linguistic-features.mdx index 26d1ad37962..57b95ee7b4f 100644 --- a/website/docs/usage/linguistic-features.mdx +++ b/website/docs/usage/linguistic-features.mdx @@ -1899,7 +1899,7 @@ the two words.
"Shore": ("coast", 0.732257), "Precautionary": ("caution", 0.490973), "hopelessness": ("sadness", 0.742366), - "Continous": ("continuous", 0.732549), + "Continuous": ("continuous", 0.732549), "Disemboweled": ("corpse", 0.499432), "biostatistician": ("scientist", 0.339724), "somewheres": ("somewheres", 0.402736), diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx index 5b783002c00..d9db2fc0cb2 100644 --- a/website/docs/usage/models.mdx +++ b/website/docs/usage/models.mdx @@ -530,13 +530,17 @@ application's `requirements.txt`. If you're running your own internal PyPi installation, you can upload the pipeline packages there. pip's [requirements file format](https://pip.pypa.io/en/latest/reference/requirements-file-format/) supports both package names to download via a PyPi server, as well as -[direct URLs](#pipeline-urls). +[direct URLs](#pipeline-urls). For instance, you can specify the +`en_core_web_sm` model for spaCy 3.7.x as follows: ```text {title="requirements.txt"} spacy>=3.0.0,<4.0.0 -en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl +en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl ``` +See the [list of models](https://spacy.io/models) for model download links for +the current spaCy version. + All pipeline packages are versioned and specify their spaCy dependency. This ensures cross-compatibility and lets you specify exact version requirements for each pipeline. If you've [trained](/usage/training) your own pipeline, you can diff --git a/website/docs/usage/projects.mdx b/website/docs/usage/projects.mdx index b089a7ab561..e10ba4c506c 100644 --- a/website/docs/usage/projects.mdx +++ b/website/docs/usage/projects.mdx @@ -173,7 +173,7 @@ detected, a corresponding warning is displayed. If you'd like to disable the dependency check, set `check_requirements: false` in your project's `project.yml`. -### 4. Run a workflow {id="run-workfow"} +### 4. Run a workflow {id="run-workflow"} > #### project.yml > @@ -286,7 +286,7 @@ pipelines. | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | | `description` | An optional project description used in [auto-generated docs](#custom-docs). | -| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overriden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | +| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). 
For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | | `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx index 97ae3c5e573..3712fbeeb80 100644 --- a/website/docs/usage/saving-loading.mdx +++ b/website/docs/usage/saving-loading.mdx @@ -306,7 +306,9 @@ installed in the same environment – that's it. ### Loading probability tables into existing models -You can load a probability table from [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an existing spaCy model like `en_core_web_sm`. +You can load a probability table from +[spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an +existing spaCy model like `en_core_web_sm`. ```python # Requirements: pip install spacy-lookups-data @@ -317,7 +319,8 @@ lookups = load_lookups("en", ["lexeme_prob"]) nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob")) ``` -When training a model from scratch you can also specify probability tables in the `config.cfg`. +When training a model from scratch you can also specify probability tables in +the `config.cfg`. ```ini {title="config.cfg (excerpt)"} [initialize.lookups] @@ -346,8 +349,8 @@ them**! 
To stick with the theme of [this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/), consider the following custom spaCy -[pipeline component](/usage/processing-pipelines#custom-coponents) that prints a -snake when it's called: +[pipeline component](/usage/processing-pipelines#custom-components) that prints +a snake when it's called: > #### Package directory structure > diff --git a/website/docs/usage/v2-2.mdx b/website/docs/usage/v2-2.mdx index 84129657dda..cf4f7c5bf57 100644 --- a/website/docs/usage/v2-2.mdx +++ b/website/docs/usage/v2-2.mdx @@ -185,7 +185,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL' 'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC' (1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338) ✔ Good amount of examples for all labels -✔ Examples without occurences available for all labels +✔ Examples without occurrences available for all labels ✔ No entities consisting of or starting/ending with whitespace =========================== Part-of-speech Tagging =========================== diff --git a/website/docs/usage/v3-2.mdx b/website/docs/usage/v3-2.mdx index b4a4ef67242..b3ffd5d6820 100644 --- a/website/docs/usage/v3-2.mdx +++ b/website/docs/usage/v3-2.mdx @@ -138,7 +138,7 @@ backwards compatibility, the tuple format remains available under `TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details in the [transformer API docs](/api/architectures#TransformerModel). -`spacy-transfomers` v1.1 also adds support for `transformer_config` settings +`spacy-transformers` v1.1 also adds support for `transformer_config` settings such as `output_attentions`. Additional output is stored under `TransformerData.model_output`. More details are in the [TransformerModel docs](/api/architectures#TransformerModel). 
The training speed diff --git a/website/meta/site.json index f1d318071c4..55fe60ad364 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -23,7 +23,6 @@ }, "docSearch": { "appId": "Y1LB128RON", - "apiKey": "bb601a1daab73e2dc66faf2b79564807", "indexName": "spacy" }, "binderUrl": "explosion/spacy-io-binder", diff --git a/website/next.config.mjs index df3b1d01d09..5e2f8f8c34d 100644 --- a/website/next.config.mjs +++ b/website/next.config.mjs @@ -32,6 +32,9 @@ const nextConfig = withPWA( ignoreBuildErrors: true, }, images: { unoptimized: true }, + env: { + DOCSEARCH_API_KEY: process.env.DOCSEARCH_API_KEY + } }) ) diff --git a/website/src/components/search.js index f80d9cd9f00..3211b53c002 100644 --- a/website/src/components/search.js +++ b/website/src/components/search.js @@ -1,4 +1,4 @@ -import React, { useEffect, useState } from 'react' +import React from 'react' import PropTypes from 'prop-types' import { DocSearch } from '@docsearch/react' import '@docsearch/css' @@ -6,7 +6,8 @@ import '@docsearch/css' import siteMetadata from '../../meta/site.json' export default function Search({ placeholder = 'Search docs' }) { - const { apiKey, indexName, appId } = siteMetadata.docSearch + const apiKey = process.env.DOCSEARCH_API_KEY + const { indexName, appId } = siteMetadata.docSearch return ( <DocSearch appId={appId} apiKey={apiKey} indexName={indexName} placeholder={placeholder} /> ) diff --git a/website/src/styles/code.module.sass index b619c71ccfd..459281b4322 100644 --- a/website/src/styles/code.module.sass +++ b/website/src/styles/code.module.sass @@ -109,6 +109,8 @@ box-shadow: inset 1px 1px 1px rgba(0, 0, 0, 0.25) background: var(--color-dark) margin: 1.5rem 0 0 2rem + position: sticky + left: 2rem .header width: 100%
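A practical note on the website changes above: the DocSearch key is no longer committed in `site.json` but read from the environment and inlined at build time through the new `env` block in `next.config.mjs`. Any environment that builds the site (local checkouts and the deployment platform alike) therefore needs to define `DOCSEARCH_API_KEY` (presumably the public, search-only key from the DocSearch dashboard), or the search widget will receive an undefined key.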