Portuguese noun chunks review (#9559)
* added tests
* added pt vocab
* transferred spanish
* added syntax iters
* fixed parenthesis
* added nmod example
* added relative pron
* fixed rel pron
* added rel subclause
* corrected typo
* added more NP chains
* long sentence
* fixed typo
* fixed typo
* fixed typo
* corrected heads
* added passive subj
* added pass subj
* added passive obj
* refinement to rights
* went back to old
* fixed test
* fixed typo
* fixed typo
* formatted
* Format
* Format test cases

Co-authored-by: Adriane Boyd <[email protected]>
1 parent 2bf52c4 · commit 6e66503 · Showing 4 changed files with 313 additions and 0 deletions.
@@ -0,0 +1,85 @@
from typing import Union, Iterator, Tuple

from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
    """
    labels = [
        "nsubj",
        "nsubj:pass",
        "obj",
        "obl",
        "obl:agent",
        "nmod",
        "pcomp",
        "appos",
        "ROOT",
    ]
    post_modifiers = ["flat", "flat:name", "fixed", "compound"]
    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = {doc.vocab.strings.add(label) for label in labels}
    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
    np_label = doc.vocab.strings.add("NP")
    adj_label = doc.vocab.strings.add("amod")
    det_label = doc.vocab.strings.add("det")
    det_pos = doc.vocab.strings.add("DET")
    adp_label = doc.vocab.strings.add("ADP")
    conj = doc.vocab.strings.add("conj")
    conj_pos = doc.vocab.strings.add("CCONJ")
    prev_end = -1
    for i, word in enumerate(doclike):
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            right_childs = list(word.rights)
            right_child = right_childs[0] if right_childs else None

            if right_child:
                if (
                    right_child.dep == adj_label
                ):  # allow chain of adjectives by expanding to right
                    right_end = right_child.right_edge
                elif (
                    right_child.dep == det_label and right_child.pos == det_pos
                ):  # cut relative pronouns here
                    right_end = right_child
                elif right_child.dep in np_modifs:  # Check if we can expand to right
                    right_end = word.right_edge
                else:
                    right_end = word
            else:
                right_end = word
            prev_end = right_end.i

            left_index = word.left_edge.i
            left_index = (
                left_index + 1 if word.left_edge.pos == adp_label else left_index
            )

            yield left_index, right_end.i + 1, np_label
        elif word.dep == conj:
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                prev_end = word.i

                left_index = word.left_edge.i  # eliminate left attached conjunction
                left_index = (
                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
                )
                yield left_index, word.i + 1, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
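As a usage illustration (not part of the commit): registering the iterator in SYNTAX_ITERATORS in the Portuguese language defaults (presumably spacy/lang/pt/syntax_iterators.py) exposes it through Doc.noun_chunks. The minimal sketch below assumes a trained Portuguese pipeline such as pt_core_news_sm is installed; the exact chunks depend on the parser's predictions.

import spacy

# Assumption: the pt_core_news_sm pipeline has been downloaded
# (python -m spacy download pt_core_news_sm)
nlp = spacy.load("pt_core_news_sm")
doc = nlp("A destruição da cidade foi descrita pelo escritor brasileiro Aníbal Machado.")
for chunk in doc.noun_chunks:
    # Each chunk is a Span labelled "NP"; start/end are token offsets into the Doc
    print(chunk.text, chunk.start, chunk.end, chunk.label_)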
@@ -0,0 +1,221 @@
from spacy.tokens import Doc
import pytest


# fmt: off
@pytest.mark.parametrize(
    "words,heads,deps,pos,chunk_offsets",
    [
        # determiner + noun
        # um cachorro -> um cachorro
        (
            ["um", "cachorro"],
            [1, 1],
            ["det", "ROOT"],
            ["DET", "NOUN"],
            [(0, 2)],
        ),
        # two determiners + noun
        # meu o pai -> meu o pai
        (
            ["meu", "o", "pai"],
            [2, 2, 2],
            ["det", "det", "ROOT"],
            ["DET", "DET", "NOUN"],
            [(0, 3)],
        ),
        # two determiners + noun
        # todos essos caros -> todos essos caros
        (
            ["todos", "essos", "caros"],
            [2, 2, 2],
            ["det", "det", "ROOT"],
            ["DET", "DET", "NOUN"],
            [(0, 3)],
        ),
        # two determiners, one is after noun
        # um irmão meu -> um irmão meu
        (
            ["um", "irmão", "meu"],
            [1, 1, 1],
            ["det", "ROOT", "det"],
            ["DET", "NOUN", "DET"],
            [(0, 3)],
        ),
        # two determiners + noun
        # o meu pai -> o meu pai
        (
            ["o", "meu", "pai"],
            [2, 2, 2],
            ["det", "det", "ROOT"],
            ["DET", "DET", "NOUN"],
            [(0, 3)],
        ),
        # relative pronoun
        # A bicicleta essa está estragada -> A bicicleta
        (
            ['A', 'bicicleta', 'essa', 'está', 'estragada'],
            [1, 4, 1, 4, 4],
            ['det', 'nsubj', 'det', 'cop', 'ROOT'],
            ['DET', 'NOUN', 'PRON', 'AUX', 'ADJ'],
            [(0, 2)]
        ),
        # relative subclause
        # o computador que comprou -> o computador
        (
            ['o', 'computador', 'que', 'comprou'],
            [1, 1, 3, 1],
            ['det', 'ROOT', 'nsubj', 'acl:relcl'],
            ['DET', 'NOUN', 'PRON', 'VERB'],
            [(0, 2), (2, 3)]
        ),
        # det + noun + adj
        # O cachorro marrom -> O cachorro marrom
        (
            ["O", "cachorro", "marrom"],
            [1, 1, 1],
            ["det", "ROOT", "amod"],
            ["DET", "NOUN", "ADJ"],
            [(0, 3)],
        ),
        # det + noun + adj plural
        # As calças baratas -> As calças baratas
        (
            ["As", "calças", "baratas"],
            [1, 1, 1],
            ["det", "ROOT", "amod"],
            ["DET", "NOUN", "ADJ"],
            [(0, 3)],
        ),
        # det + adj + noun
        # Uma boa ideia -> Uma boa ideia
        (
            ['uma', 'boa', 'ideia'],
            [2, 2, 2],
            ["det", "amod", "ROOT"],
            ["DET", "ADJ", "NOUN"],
            [(0, 3)]
        ),
        # multiple adjectives
        # Uma garota esperta e inteligente -> Uma garota esperta e inteligente
        (
            ["Uma", "garota", "esperta", "e", "inteligente"],
            [1, 1, 1, 4, 2],
            ["det", "ROOT", "amod", "cc", "conj"],
            ["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
            [(0, 5)]
        ),
        # determiner, adjective, compound created by flat
        # a grande São Paolo -> a grande São Paolo
        (
            ["a", "grande", "São", "Paolo"],
            [2, 2, 2, 2],
            ["det", "amod", "ROOT", "flat:name"],
            ["DET", "ADJ", "PROPN", "PROPN"],
            [(0, 4)]
        ),
        # one determiner + one noun + one adjective qualified by an adverb
        # alguns fazendeiros muito ricos -> alguns fazendeiros muito ricos
        (
            ['alguns', 'fazendeiros', 'muito', 'ricos'],
            [1, 1, 3, 1],
            ['det', 'ROOT', 'advmod', 'amod'],
            ['DET', 'NOUN', 'ADV', 'ADJ'],
            [(0, 4)]
        ),
        # Two NPs conjuncted
        # Eu tenho um cachorro e um gato -> Eu, um cachorro, um gato
        (
            ["Eu", "tenho", "um", "cachorro", "e", "um", "gato"],
            [1, 1, 3, 1, 6, 6, 3],
            ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
            ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
            [(0, 1), (2, 4), (5, 7)]
        ),
        # Two NPs together
        # o escritor brasileiro Aníbal Machado -> o escritor brasileiro, Aníbal Machado
        (
            ['o', 'escritor', 'brasileiro', 'Aníbal', 'Machado'],
            [1, 1, 1, 1, 3],
            ['det', 'ROOT', 'amod', 'appos', 'flat:name'],
            ['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
            [(0, 3), (3, 5)]
        ),
        # Noun compound, person name and titles
        # Dom Pedro II -> Dom Pedro II
        (
            ["Dom", "Pedro", "II"],
            [0, 0, 0],
            ["ROOT", "flat:name", "flat:name"],
            ["PROPN", "PROPN", "PROPN"],
            [(0, 3)]
        ),
        # Noun compound created by flat
        # os Estados Unidos -> os Estados Unidos
        (
            ["os", "Estados", "Unidos"],
            [1, 1, 1],
            ["det", "ROOT", "flat:name"],
            ["DET", "PROPN", "PROPN"],
            [(0, 3)]
        ),
        # nmod relation between NPs
        # a destruição da cidade -> a destruição, cidade
        (
            ['a', 'destruição', 'da', 'cidade'],
            [1, 1, 3, 1],
            ['det', 'ROOT', 'case', 'nmod'],
            ['DET', 'NOUN', 'ADP', 'NOUN'],
            [(0, 2), (3, 4)]
        ),
        # Compounding by nmod, several NPs chained together
        # a primeira fábrica de medicamentos do governo -> a primeira fábrica, medicamentos, governo
        (
            ["a", "primeira", "fábrica", "de", "medicamentos", "do", "governo"],
            [2, 2, 2, 4, 2, 6, 2],
            ['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
            ['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
            [(0, 3), (4, 5), (6, 7)]
        ),
        # several NPs
        # Tradução da reportagem de Susana -> Tradução, reportagem, Susana
        (
            ['Tradução', 'da', 'reportagem', 'de', 'Susana'],
            [0, 2, 0, 4, 2],
            ['ROOT', 'case', 'nmod', 'case', 'nmod'],
            ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
            [(0, 1), (2, 3), (4, 5)]
        ),
        # Several NPs
        # O gato gordo da Susana e seu amigo -> O gato gordo, Susana, seu amigo
        (
            ['O', 'gato', 'gordo', 'da', 'Susana', 'e', 'seu', 'amigo'],
            [1, 1, 1, 4, 1, 7, 7, 1],
            ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'conj'],
            ['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
            [(0, 3), (4, 5), (6, 8)]
        ),
        # Passive subject
        # Os novos gastos são alimentados pela grande conta bancária de Clinton -> Os novos gastos, grande conta bancária, Clinton
        (
            ['Os', 'novos', 'gastos', 'são', 'alimentados', 'pela', 'grande', 'conta', 'bancária', 'de', 'Clinton'],
            [2, 2, 4, 4, 4, 7, 7, 4, 7, 10, 7],
            ['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
            ['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
            [(0, 3), (6, 9), (10, 11)]
        )
    ],
)
# fmt: on
def test_pt_noun_chunks(pt_vocab, words, heads, deps, pos, chunk_offsets):
    doc = Doc(pt_vocab, words=words, heads=heads, deps=deps, pos=pos)
    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets


def test_noun_chunks_is_parsed_pt(pt_tokenizer):
    """Test that noun_chunks raises ValueError for 'pt' language if Doc is not parsed."""
    doc = pt_tokenizer("en Oxford este verano")
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
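For a self-contained picture of what these tests exercise: each case builds a Doc with gold-standard heads, deps, and pos, so the chunker runs without a trained parser. Below is a rough standalone sketch of the nmod test case, assuming only that spaCy is installed; spacy.blank("pt") stands in for the pt_vocab test fixture.

import spacy
from spacy.tokens import Doc

# A blank "pt" pipeline registers the Portuguese noun_chunks iterator on its vocab
vocab = spacy.blank("pt").vocab
doc = Doc(
    vocab,
    words=["a", "destruição", "da", "cidade"],
    heads=[1, 1, 3, 1],  # absolute token index of each token's head
    deps=["det", "ROOT", "case", "nmod"],
    pos=["DET", "NOUN", "ADP", "NOUN"],
)
# Expected, per the nmod test case above: [('a destruição', 0, 2), ('cidade', 3, 4)]
print([(c.text, c.start, c.end) for c in doc.noun_chunks])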