# SpaCy tutorial
# https://course.spacy.io/en/chapter1
# https://course.spacy.io/en/chapter2
# https://course.spacy.io/en/chapter3
# https://course.spacy.io/en/chapter4
-----------------
from spacy.lang.en import English
#-- nlp object
nlp = English()
#-- doc object
doc = nlp("Hello world")
#-- iterate over tokens in doc
for token in doc:
    print(token.text)
#-- token object
token = doc[1]
#-- span is slice
span = doc[1:3]
print(span.text)
token.is_alpha
token.is_punct
token.like_num
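#-- quick sketch (not from the course; example sentence made up) printing these flags for each token
doc = nlp("It costs $5, which is roughly 10%.")
for token in doc:
    print(token.text, token.is_alpha, token.is_punct, token.like_num)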
# Import the Spanish language class
from spacy.lang.es import Spanish
# Create the nlp object
nlp = Spanish()
# Process a text (this is Spanish for: "How are you?")
doc = nlp("¿Cómo estás?")
# Print the document text
print(doc.text)
#------ documents, spans and tokens
# Import the English language class and create the nlp object
from spacy.lang.en import English
nlp = English()
# Process the text
doc = nlp("I like tree kangaroos and narwhals.")
# Select the first token
first_token = doc[0]
# Print the first token's text
print(first_token.text)
#------- slices
doc = nlp("I like tree kangaroos and narwhals.")
# A slice of the Doc for "tree kangaroos"
tree_kangaroos = doc[2:4]
print(tree_kangaroos.text)
# A slice of the Doc for "tree kangaroos and narwhals" (without the ".")
tree_kangaroos_and_narwhals = doc[2:6]
print(tree_kangaroos_and_narwhals.text)
#---------- lexical attributes
# Use lexical attributes to find percentages in a text.
doc = nlp(
"In 1990, more than 60% of people in East Asia were in extreme poverty. "
"Now less than 4% are."
)
# Iterate over the tokens in the doc
for token in doc:
    # Check if the token resembles a number
    if token.like_num:
        # Get the next token in the document
        next_token = doc[token.i + 1]
        # Check if the next token's text equals "%"
        if next_token.text == "%":
            print("Percentage found:", token.text)
#------------ statistics models / predictions / POS tags
python -m spacy download en_core_web_sm
import spacy
nlp = spacy.load("en_core_web_sm")
----
doc = nlp("she ate the pizza")
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)
------
doc = nlp("The Dominion will deliver order to the galaxy")
for ent in doc.ents:
    print(ent.text, ent.label_)
spacy.explain("GPE")
#--- predicting linguistic annotations
nlp = spacy.load("en_core_web_sm")
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"
# Process the text
doc = nlp(text)
for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")
---
# Process the text
doc = nlp(text)
# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)
#---- predicting named entities
nlp = spacy.load("en_core_web_sm")
text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"
# Process the text
doc = nlp(text)
# Iterate over the entities
for ent in doc.ents:
    # Print the entity text and label
    print(ent.text, ent.label_)
# Get the span for "iPhone X"
iphone_x = doc[1:3]
# Print the span text
print("Missing entity:", iphone_x.text)
#----- rule based entities (make rules)
# Match patterns are lists of dicts, one dict per token
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"TEXT":"iPhone"}, {'TEXT':'X'}]
matcher.add("IPHONE_PATTEN", None, pattern)
doc = nlp("Upcoming iPhone X is released on May 17")
matches = matcher(doc)
for match_id, start, end in matches:
matched_span = doc[start, end]
print(matched_span.text)
# {"OP": "!"}  negate: match 0 times
# {"OP": "?"}  optional: match 0 or 1 times
# {"OP": "+"}  match 1 or more times
# {"OP": "*"}  match 0 or more times
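# A small sketch (example text made up) of the optional operator; the matcher
# returns every match, including overlapping ones:
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True, "OP": "?"}]  # version number is optional
matcher.add("IOS_OPTIONAL", None, pattern)
doc = nlp("I updated to iOS 11, but the old iPad still runs iOS.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)  # "iOS", "iOS 11" and the trailing bare "iOS"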
---------------- patterns
# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]
# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
---
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]
# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
# Write a pattern for adjective plus one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "?"}]
# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))
# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)
=========================== data structures
#- vocab
coffee_hash = nlp.vocab.strings["coffee"]
coffee_string = nlp.vocab.strings[coffee_hash]
doc = nlp("I love coffee")
nlp.vocab.strings["coffee"]  # look up the hash for a string
lexeme = nlp.vocab["coffee"]  # a Lexeme is the context-independent vocab entry
lexeme.text
lexeme.orth  # the hash
lexeme.is_alpha
---
nlp = English()
doc = nlp("I have a cat")
# Look up the hash for the word "cat"
cat_hash = nlp.vocab.strings['cat']
print(cat_hash)
# Look up the cat_hash to get the string
cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)
----
doc = nlp("David Bowie is a PERSON")
# Look up the hash for the string label "PERSON"
person_hash = nlp.vocab.strings["PERSON"]
print(person_hash)
# Look up the person_hash to get the string
person_string = nlp.vocab.strings[person_hash]
print(person_string)
----------------------- doc, span
from spacy.tokens import Doc, Span
words = ["hi", "hello", "world"]
spaces = [True, True, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
span = Span(doc, 0, 2)  # slice of the doc
span_with_label = Span(doc, 0, 2, label="GREETING")
doc.ents = [span_with_label]
----
from spacy.tokens import Doc
words = ["spaCy", "is", "cool", "!"]
spaces = [True, True, False, False]
# Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)
----
# Desired text: "Go, get started!"
words = ["Go", ",", "get", "started", "!"]
spaces = [False, True, True, False, False]
# Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)
----
words = ['Oh',',','really','?','!']
spaces = [False, True, False, False, False]
# Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)
-----------
from spacy.tokens import Doc, Span
words = ["I", "like", "David", "Bowie"]
spaces = [True, True, True, False]
# Create a doc from the words and spaces
doc = Doc(nlp.vocab, words=words,spaces=spaces)
print(doc.text)
# Create a span for "David Bowie" from the doc and assign it the label "PERSON"
span = Span(doc, 2,4, label='PERSON')
print(span.text, span.label_)
# Add the span to the doc's entities
doc.ents = [span]
# Print entities' text and labels
print([(ent.text, ent.label_) for ent in doc.ents])
--------
nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")
# Iterate over the tokens
for token in doc:
    # Check if the current token is a proper noun
    if token.pos_ == "PROPN":
        # Check if the next token is a verb
        if doc[token.i + 1].pos_ == "VERB":
            print("Found proper noun before a verb:", token.text)
---------------- word vectors
Doc.similarity()
Span.similarity()
Token.similarity()
nlp = spacy.load("en_core_web_sm")
doc1 = nlp('i like veggies')
doc2 = nlp('I like veggie pizza')
doc1.similarity(doc2)
----------
# Compare a span with a doc
span = nlp("string")[0:1]
doc = nlp("cheese strings")
span.similarity(doc)
----------
# Compare two tokens
doc = nlp("I like pizza and pasta")
token1 = doc[2]
token2 = doc[4]
print(token1.similarity(token2))
-------
# Load a larger model with vectors
nlp = spacy.load("en_core_web_md")
doc = nlp("I have a banana")
# Access the vector via the token.vector attribute
print(doc[3].vector)
--------
nlp = spacy.load("en_core_web_md")
# Process a text
doc = nlp("Two bananas in pyjamas")
# Get the vector for the token "bananas"
bananas_vector = doc[1].vector
print(bananas_vector)
--------------------------- comparing similarity
import spacy
nlp = spacy.load("en_core_web_md")
doc1 = nlp("It's a warm summer day")
doc2 = nlp("It's sunny outside")
# Get the similarity of doc1 and doc2
similarity = doc1.similarity(doc2)
print(similarity)
-----
import spacy
nlp = spacy.load("en_core_web_md")
doc = nlp("TV and books")
token1, token2 = doc[0], doc[2]
# Get the similarity of the tokens "TV" and "books"
similarity = token1.similarity(token2)
print(similarity)
-------
import spacy
nlp = spacy.load("en_core_web_md")
doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")
# Create spans for "great restaurant" and "really nice bar"
span1 = doc[3:5]
span2 = doc[12:15]
# Get the similarity of the spans
similarity = span1.similarity(span2)
print(similarity)
-------
--------------- combining models & rules
## statistical predictions
use cases: things that need to generalize from context,
e.g. product names, person names, subject/object relations
## rule-based systems
use cases: finite dictionaries of examples,
e.g. countries of the world, dog breeds, etc.
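# A rough sketch contrasting the two (sentence and dog-breed list made up):
import spacy
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en_core_web_sm")
doc = nlp("Angela Merkel adopted a Golden Retriever and a Labrador Retriever")
# statistical model: generalizes to names it was never given as a list
print([(ent.text, ent.label_) for ent in doc.ents])
# rule-based: a finite list (dog breeds) matched exactly with the PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
matcher.add("DOG_BREED", None, *list(nlp.pipe(["Golden Retriever", "Labrador Retriever"])))
print([doc[start:end].text for match_id, start, end in matcher(doc)])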
------------------------- rule based matching
# Initialize with the shared vocab
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
# Patterns are lists of dictionaries describing the tokens
pattern = [{"LEMMA": "love", "POS": "VERB"}, {"LOWER": "cats"}]
matcher.add("LOVE_CATS", None, pattern)
# Operators can specify how often a token should be matched
pattern = [{"TEXT": "very", "OP": "+"}, {"TEXT": "happy"}]
matcher.add("VERY_HAPPY", None, pattern)
# Calling matcher on doc returns list of (match_id, start, end) tuples
doc = nlp("I love cats and I'm very very happy")
matches = matcher(doc)
--------------------- debugging patterns
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
doc = nlp(
"Twitch Prime, the perks program for Amazon Prime members offering free "
"loot, games and other benefits, is ditching one of its best features: "
"ad-free viewing. According to an email sent out to Amazon Prime members "
"today, ad-free viewing will no longer be included as a part of Twitch "
"Prime for new members, beginning on September 14. However, members with "
"existing annual subscriptions will be able to continue to enjoy ad-free "
"viewing until their subscription comes up for renewal. Those with "
"monthly subscriptions will have access to ad-free viewing until October 15."
)
# Create the match patterns
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
pattern2 = [{"LOWER": "ad"}, {"TEXT": "-"}, {"LOWER": "free"}, {"POS": "NOUN"}]
# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", None, pattern1)
matcher.add("PATTERN2", None, pattern2)
# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)
-------------
------------ efficient phrase matching
match exact strings instead of patterns
for finite lists like countries
import json
from spacy.lang.en import English
with open("exercises/en/countries.json", encoding="utf8") as f:
COUNTRIES = json.loads(f.read())
nlp = English()
doc = nlp("Czech Republic may help Slovakia protect its airspace")
# Import the PhraseMatcher and initialize it
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
# Create pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)
# Call the matcher on the test document and print the result
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])
-----------
------ extracting countries and relationships
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
import json
with open("exercises/en/countries.json", encoding="utf8") as f:
COUNTRIES = json.loads(f.read())
with open("exercises/en/country_text.txt", encoding="utf8") as f:
TEXT = f.read()
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)
# Create a doc and reset existing entities
doc = nlp(TEXT)
doc.ents = []
# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Create a Span with the label for "GPE"
    span = Span(doc, start, end, label="GPE")
    # Overwrite the doc.ents and add the span
    doc.ents = list(doc.ents) + [span]
    # Get the span's root head token
    span_root_head = span.root.head
    # Print the text of the span root's head token and the span text
    print(span_root_head.text, "-->", span.text)

# Print the entities in the document
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "GPE"])
---------
--------------- chapter 3 --- processing pipelines
-- inspecting the pipeline
import spacy
# Load the en_core_web_sm model
nlp = spacy.load("en_core_web_sm")
# Print the names of the pipeline components
print(nlp.pipe_names)
# Print the full pipeline of (name, component) tuples
print(nlp.pipeline)
----------- custom pipelines
--- simple component
import spacy
# Define the custom component
def length_component(doc):
    # Get the doc's length
    doc_length = len(doc)
    print(f"This document is {doc_length} tokens long.")
    # Return the doc
    return doc
# Load the small English model
nlp = spacy.load("en_core_web_sm")
# Add the component first in the pipeline and print the pipe names
nlp.add_pipe(length_component, first=True)
print(nlp.pipe_names)
# Process a text
doc = nlp("This is a sentence.")
-------
---------- complex component
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)
# Define the custom component
def animal_component(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label "ANIMAL"
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc
# Add the component to the pipeline after the "ner" component
nlp.add_pipe(animal_component, after="ner")
print(nlp.pipe_names)
# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])
-------- setting custom attributes
Add custom metadata to documents, tokens and spans
Accessible via the ._ property
doc._.title = "My document"
token._.is_color = True
span._.has_color = False
-----------
# Import global classes
from spacy.tokens import Doc, Token, Span
# Set extensions on the Doc, Token and Span
Doc.set_extension("title", default=None)
Token.set_extension("is_color", default=False)
Span.set_extension("has_color", default=False)
Extension attribute types -
Attribute extensions
Property extensions
Method extensions
-- Overwrite extensions
from spacy.tokens import Token
# Set extension on the Token with default value
Token.set_extension("is_color", default=False)
doc = nlp("The sky is blue.")
# Overwrite extension attribute value
doc[3]._.is_color = True
------- define a getter and setter function
from spacy.tokens import Token
# Define getter function
def get_is_color(token):
    colors = ["red", "yellow", "blue"]
    return token.text in colors
# Set extension on the Token with getter
Token.set_extension("is_color", getter=get_is_color)
doc = nlp("The sky is blue.")
print(doc[3]._.is_color, "-", doc[3].text)
-------
from spacy.tokens import Span
# Define getter function
def get_has_color(span):
    colors = ["red", "yellow", "blue"]
    return any(token.text in colors for token in span)
# Set extension on the Span with getter
Span.set_extension("has_color", getter=get_has_color)
doc = nlp("The sky is blue.")
print(doc[1:4]._.has_color, "-", doc[1:4].text)
print(doc[0:2]._.has_color, "-", doc[0:2].text)
---- method extensions
from spacy.tokens import Doc
# Define method with arguments
def has_token(doc, token_text):
    in_doc = token_text in [token.text for token in doc]
    return in_doc
# Set extension on the Doc with method
Doc.set_extension("has_token", method=has_token)
doc = nlp("The sky is blue.")
print(doc._.has_token("blue"), "- blue")
print(doc._.has_token("cloud"), "- cloud")
---------------------------
from spacy.lang.en import English
from spacy.tokens import Token
nlp = English()
# Register the Token extension attribute "is_country" with the default value False
Token.set_extension("is_country", default=False)
# Process the text and set the is_country attribute to True for the token "Spain"
doc = nlp("I live in Spain.")
doc[3]._.is_country = True
# Print the token text and the is_country attribute for all tokens
print([(token.text, token._.is_country) for token in doc])
---------------
------------------ reverse text tokens
from spacy.lang.en import English
from spacy.tokens import Token
nlp = English()
# Define the getter function that takes a token and returns its reversed text
def get_reversed(token):
    return token.text[::-1]
# Register the Token property extension "reversed" with the getter get_reversed
Token.set_extension("reversed", getter=get_reversed)
# Process the text and print the reversed attribute for each token
doc = nlp("All generalizations are false, including this one.")
for token in doc:
    print("reversed:", token._.reversed)
--------------
---------------- html text
from spacy.lang.en import English
from spacy.tokens import Span
nlp = English()
# Define the method
def to_html(span, tag):
    # Wrap the span text in an HTML tag and return it
    return f"<{tag}>{span.text}</{tag}>"
# Register the Span method extension "to_html" with the method to_html
Span.set_extension("to_html", method=to_html)
# Process the text and call the to_html method on the span with the tag name "strong"
doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print(span._.to_html("strong"))
-------------
###########################
combine custom extension attributes with the model’s predictions
and create an attribute getter that returns a Wikipedia search
URL if the span is a person, organization, or location.
###########################
import spacy
from spacy.tokens import Span
nlp = spacy.load("en_core_web_sm")
def get_wikipedia_url(span):
    # Get a Wikipedia URL if the span has one of the labels
    if span.label_ in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text
# Set the Span extension wikipedia_url using the getter get_wikipedia_url
Span.set_extension("wikipedia_url", getter=get_wikipedia_url)
doc = nlp(
"In over fifty years from his very first recordings right through to his "
"last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent._.wikipedia_url)
-------------------------
--------- phrase match country to capitals
import json
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher
with open("exercises/en/countries.json", encoding="utf8") as f:
COUNTRIES = json.loads(f.read())
with open("exercises/en/capitals.json", encoding="utf8") as f:
CAPITALS = json.loads(f.read())
nlp = English()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))
def countries_component(doc):
    # Create an entity Span with the label "GPE" for all matches
    matches = matcher(doc)
    doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matches]
    return doc
# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)
# Getter that looks up the span text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)
# Register the Span extension attribute "capital" with the getter get_capital
Span.set_extension("capital", getter=get_capital)
# Process the text and print the entity text, label and capital attributes
doc = nlp("Czech Republic may help Slovakia protect its airspace")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])
-----------------
------------------- scaling and performance
processing large volumes of text
use nlp.pipe()
docs = list(nlp.pipe(LOTS_OF_TEXTS))
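# e.g. (texts made up; LOTS_OF_TEXTS stands for any list of strings):
texts = ["First text", "Second text", "Third text"]
# slow: runs the whole pipeline once per call, one text at a time
docs = [nlp(text) for text in texts]
# better: nlp.pipe streams the texts and processes them in batches
docs = list(nlp.pipe(texts))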
------------ passing in context
data = [
("This is a text", {"id": 1, "page_number": 15}),
("And another text", {"id": 2, "page_number": 16}),
]
for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context["page_number"])
------------
from spacy.tokens import Doc
Doc.set_extension("id", default=None)
Doc.set_extension("page_number", default=None)
data = [
("This is a text", {"id": 1, "page_number": 15}),
("And another text", {"id": 2, "page_number": 16}),
]
for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context["id"]
    doc._.page_number = context["page_number"]
--------------
-------------- using only the tokenizer
doc = nlp.make_doc("Hello world")
----------- disable pipeline component
# Disable tagger and parser
with nlp.disable_pipes("tagger", "parser"):
# Process the text and print the entities
doc = nlp(text)
print(doc.ents)
---------------------
-------- processing streams
import json
import spacy
nlp = spacy.load("en_core_web_sm")
with open("exercises/en/tweets.json", encoding="utf8") as f:
TEXTS = json.loads(f.read())
# Process the texts and print the adjectives
for doc in nlp.pipe(TEXTS):
print([token.text for token in doc if token.pos_ == "ADJ"])
---------
import json
import spacy
nlp = spacy.load("en_core_web_sm")
with open("exercises/en/tweets.json", encoding="utf8") as f:
TEXTS = json.loads(f.read())
# Process the texts and print the entities
docs = list(nlp.pipe(TEXTS))
entities = [doc.ents for doc in docs]
print(*entities)
----------
from spacy.lang.en import English
nlp = English()
people = ["David Bowie", "Angela Merkel", "Lady Gaga"]
# Create a list of patterns for the PhraseMatcher
patterns = list(nlp.pipe(people))
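# A possible follow-up (not part of the exercise; sentence made up) showing the patterns in use:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
matcher.add("PEOPLE", None, *patterns)
doc = nlp("David Bowie and Angela Merkel never met Lady Gaga")
print([doc[start:end].text for match_id, start, end in matcher(doc)])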
-----------