2
2
from multiprocessing import Process , Queue , Event , cpu_count
3
3
from pathlib import Path
4
4
import queue
5
+ import re
5
6
import sys
6
7
import traceback
7
8
from typing import Generator , Iterable , Optional , List
10
11
from .analyzer import Analyzer
11
12
12
13
MINI_BATCH_SIZE = 100
14
+ GINZA_MODEL_PATTERN = re .compile (r"^(ja_ginza|ja_ginza_electra)$" )
15
+ SPACY_MODEL_PATTERN = re .compile (r"^[a-z]{2}[-_].+[-_].+(sm|md|lg|trf)$" )
13
16
14
17
15
18
class _OutputWrapper :
@@ -61,7 +64,6 @@ def run(
61
64
parallel_level : int = 1 ,
62
65
files : List [str ] = None ,
63
66
):
64
- assert model_path is None or ensure_model is None
65
67
if output_format in ["3" , "json" ] and hash_comment != "analyze" :
66
68
print (
67
69
f'hash_comment="{ hash_comment } " not permitted for JSON output. Forced to use hash_comment="analyze".' ,
@@ -86,9 +88,31 @@ def run(
86
88
print ("GPU enabled" , file = sys .stderr )
87
89
parallel_level = level
88
90
91
+ assert model_path is None or ensure_model is None
92
+ if ensure_model :
93
+ ensure_model = ensure_model .replace ("-" , "_" )
94
+ try :
95
+ from importlib import import_module
96
+ import_module (ensure_model )
97
+ except ModuleNotFoundError :
98
+ if GINZA_MODEL_PATTERN .match (ensure_model ):
99
+ print ("Installing" , ensure_model , file = sys .stderr )
100
+ import pip
101
+ pip .main (["install" , ensure_model ])
102
+ print ("Successfully installed" , ensure_model , file = sys .stderr )
103
+ elif SPACY_MODEL_PATTERN .match (ensure_model ):
104
+ print ("Installing" , ensure_model , file = sys .stderr )
105
+ from spacy .cli .download import download
106
+ download (ensure_model )
107
+ print ("Successfully installed" , ensure_model , file = sys .stderr )
108
+ else :
109
+ raise OSError ("E050" , f'You need to install "{ ensure_model } " before executing ginza.' )
110
+ model_name_or_path = ensure_model
111
+ else :
112
+ model_name_or_path = model_path
113
+
89
114
analyzer = Analyzer (
90
- model_path ,
91
- ensure_model ,
115
+ model_name_or_path ,
92
116
split_mode ,
93
117
hash_comment ,
94
118
output_format ,
@@ -288,7 +312,7 @@ def main_ginzame():
288
312
289
313
@plac .annotations (
290
314
model_path = ("model directory path" , "option" , "b" , str ),
291
- ensure_model = ("select model either ja_ginza or ja_ginza_electra " , "option" , "m" , str , [ "ja_ginza" , "ja-ginza" , "ja_ginza_electra" , "ja-ginza-electra" , None ] ),
315
+ ensure_model = ("select model package of ginza or spacy " , "option" , "m" , str ),
292
316
split_mode = ("split mode" , "option" , "s" , str , ["A" , "B" , "C" ]),
293
317
hash_comment = ("hash comment" , "option" , "c" , str , ["print" , "skip" , "analyze" ]),
294
318
output_path = ("output path" , "option" , "o" , Path ),
0 commit comments