__main__.py

import argparse
import json
import logging

import numpy as np
from sentence_transformers import SentenceTransformer

import enonce_de_la_question
import NameEntityRecognition.NER_spacy as spacyobject
import NameEntityRecognition.NER_stanza as stanzaobject
import NameEntityRecognition.NER_statistiques as statistiques
import TopicRecognition.questions_par_topic as par_topic
import TopicRecognition.topic_par_soussection as par_ssection
import TopicRecognition.topic_recognition as recognition

parser = argparse.ArgumentParser()
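

# CLI definition: one subcommand per processing step (NER, NER frequencies,
# question-statement extraction, topic recognition, topics per question).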
def load_cli_parameters():
    subparsers = parser.add_subparsers(title="commands")
    # NER command
    ner_parser = subparsers.add_parser('ner', help="Create a file data_file+'NER_'+algo+'_results' with 4 columns:\n"
                                       "- 'text': the text of the data file\n- 'size': its number of tokens\n"
                                       "- 'ner': the list of named entities recognized by the algorithm in the text\n"
                                       "- 'description': a description of each named entity")
    ner_parser.add_argument('algo', type=str, metavar='algo', choices=['spacy', 'stanza'],
                            help="Choose the algorithm, spacy or stanza, for the named entity recognition. "
                                 "Stanza takes longer than spaCy but recognizes names more reliably.")
    ner_parser.add_argument('data_file', type=str,
                            help="The file containing the data. The file should contain a column 'contenu' with the text, "
                                 "and columns 'intervenant_nom' and 'intervenant_fonction' with, respectively, the name of the speaker and their function.")
    ner_parser.set_defaults(func=ner)
    # NER frequencies command
    frequencies_ner_parser = subparsers.add_parser('frequencies_ner', help="From the NER results, create a file data_file+algo+'_statistiques' with 4 columns:\n"
                                                   "- 'last': the last token of the counted named entity\n"
                                                   "- 'longest': the longest named entity containing this last token\n"
                                                   "- 'personnes': all the named entities sharing this last token\n"
                                                   "- 'count': the total number of occurrences of the named entities in 'personnes'")
    frequencies_ner_parser.add_argument('algo', type=str, metavar='algo', choices=['spacy', 'stanza'],
                                        help="The algorithm, spacy or stanza, used for the named entity recognition.")
    frequencies_ner_parser.add_argument('data_file', type=str,
                                        help="The file containing the data. The file should contain a column 'contenu' with the text, "
                                             "and columns 'intervenant_nom' and 'intervenant_fonction' with, respectively, the name of the speaker and their function.")
    frequencies_ner_parser.set_defaults(func=frequencies_ner)
    # question statements command
    enunciate_parser = subparsers.add_parser('enunciate', help="Create a file with the list of subsections ('sous sections') and the statement of the question for each subsection.")
    enunciate_parser.add_argument('data_file', type=str,
                                  help="The file containing the data. The file should contain a column 'contenu' with the text, "
                                       "and columns 'intervenant_nom' and 'intervenant_fonction' with, respectively, the name of the speaker and their function.")
    enunciate_parser.add_argument('name_template', type=str, help="Name used as the template for every output file concerning this data file.")
    enunciate_parser.set_defaults(func=enunciate)
    # topic recognition results and visualisation command
    topics_parser = subparsers.add_parser('topics_recognition', help="Create a file giving, for each line of the data file, its associated topic, "
                                          "a file with the list of subsections associated with each topic, and a visualisation of the topics.")
    topics_parser.add_argument('data_file', type=str, help="The file containing the data.")
    topics_parser.add_argument('--first_time', action='store_false',
                               help="Defaults to True, meaning this is the first time the dataset is used. "
                                    "Pass this flag to set it to False: the dataset has already been processed with "
                                    "topics_recognition, so cached intermediate results are reused to save time.")
    topics_parser.add_argument('name_template', type=str, help="Name used as the template for every output file concerning this data file.")
    topics_parser.add_argument('--questions_only', action='store_true',
                               help="Defaults to False, plotting every point in the visualisation. "
                                    "Pass this flag to plot only the points corresponding to question statements.")
    # If the user omits --questions_only its value is False; passing --questions_only sets it to True.
    # The same applies to --use_labels.
    topics_parser.add_argument('--use_labels', action='store_true',
                               help="Defaults to False, using no labels for the topics. "
                                    "Pass this flag to use the labels defined for the nosdeputes document to qualify the topics.")
    topics_parser.set_defaults(func=topics)
    # topics for each question command
    topics_question_parser = subparsers.add_parser('topics_question', help="From the results of the topics_recognition command, create a file giving, for each subsection, its associated topics.")
    topics_question_parser.add_argument('name_template', type=str, help="Name used as the template for every output file concerning the data processed with topics_recognition.")
    topics_question_parser.set_defaults(func=topics_question)
    cli_parameters = vars(parser.parse_args())
    return cli_parameters
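

# Command handlers: each subparser binds one of the functions below through
# set_defaults(func=...); main() calls it with the parsed CLI parameters as
# keyword arguments.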
def ner(algo, data_file):
    if algo == "spacy":
        spacyobject.ner(data_file)
    elif algo == "stanza":
        stanzaobject.ner(data_file)


def frequencies_ner(algo, data_file):
    statistiques.stats(algo, data_file)


def enunciate(data_file, name_template):
    enonce_de_la_question.enunciate(data_file, name_template)
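

# Topic modelling pipeline: embed the documents with a French sentence
# encoder (CamemBERT), fit the topic model configured in
# TopicRecognition.topic_recognition, then store and visualise the results.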
def topics(data_file, first_time, name_template, questions_only, use_labels):
    titles = []
    phrases_completes = []
    if first_time:
        results = recognition.get_docs(data_file, name_template)
        docs = results[0]
        titles = results[1]
        phrases_completes = results[2]
    else:
        results = recognition.get_docs_filtres(name_template)
        docs = results[0]
        phrases_completes = results[2]
    embedding_model = SentenceTransformer("dangvantuan/sentence-camembert-large")
    print(len(docs))
    print(first_time)
    # Compute the embeddings on the first run and cache them to disk; on later
    # runs, reload them from the cache instead of re-encoding.
    if first_time:
        embeddings = embedding_model.encode(docs, show_progress_bar=True)
        print(len(embeddings))
        with open(name_template + "_embeddings.json", "w") as f:
            json.dump(embeddings.tolist(), f)
    else:
        with open(name_template + "_embeddings.json") as f:
            embeddings = np.array(json.load(f))
    topic_model = recognition.set_model_parameters(embedding_model)
    topic_model.fit(docs, embeddings)
    print(topic_model.generate_topic_labels(nr_words=5, topic_prefix=True, word_length=None, separator=' -- '))
    print(topic_model.get_topic_info())
    labels = None
    if use_labels:
        labels = recognition.labels
        par_topic.questions_par_topic(phrases_completes, labels, topic_model.topics_, name_template, use_labels)
    else:
        par_topic.questions_par_topic(phrases_completes, topic_model.get_topic_info()["Name"], topic_model.topics_, name_template, use_labels)
    recognition.store_results(name_template, topic_model, docs, labels)
    if questions_only:
        recognition.visualisation(True, False, phrases_completes, None, name_template, topic_model, embeddings, labels)
    else:
        recognition.visualisation(False, False, None, titles, name_template, topic_model, embeddings, labels)


def topics_question(name_template):
    par_ssection.classification(name_template)


def main():
    cli_params = load_cli_parameters()
    # if cli_params["silent"]:
    #     logger.setLevel(level=logging.ERROR)
    if "func" in cli_params:
        # config_path = cli_params.pop("config_path", None)
        # check_custom_config(config_path)
        command = cli_params.pop("func")
        command(**cli_params)
    else:
        parser.print_help()


if __name__ == '__main__':
    main()
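

# Example invocations (illustrative: replace <package> with the actual package
# this __main__.py belongs to, and seance.csv with a real data file):
#   python -m <package> ner spacy seance.csv
#   python -m <package> frequencies_ner stanza seance.csv
#   python -m <package> enunciate seance.csv run1
#   python -m <package> topics_recognition seance.csv run1 --questions_only
#   python -m <package> topics_question run1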