diff --git a/.github/workflows/nlp-question-generator.yml b/.github/workflows/nlp-question-generator.yml
new file mode 100644
index 00000000..c2a72a61
--- /dev/null
+++ b/.github/workflows/nlp-question-generator.yml
@@ -0,0 +1,54 @@
+name: Question Generator
+
+on:
+  push:
+    branches:
+      - main
+      - v*-branch
+    paths:
+      - tasks/nlp-question-generator/**
+      - tests/datasets.py
+      - tests/server.py
+      - tests/test_nlp_question_generator.py
+
+  pull_request:
+    branches:
+      - main
+      - v*-branch
+    paths:
+      - tasks/nlp-question-generator/**
+      - tests/datasets.py
+      - tests/server.py
+      - tests/test_nlp_question_generator.py
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    # Runs all of the steps inside the specified container rather than on the VM host.
+    # Because of this, the network configuration changes from a host-based network to a container network.
+    container:
+      image: platiagro/platiagro-notebook-image:0.3.0
+
+    services:
+
+      minio:
+        image: bitnami/minio:latest
+        env:
+          MINIO_ACCESS_KEY: minio
+          MINIO_SECRET_KEY: minio123
+        ports:
+          - 9000:9000
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Test with pytest
+        run: |
+          pip install pytest
+          pytest -v tests/test_nlp_question_generator.py
+        timeout-minutes: 90
+        env:
+          MINIO_ENDPOINT: minio:9000
+          MINIO_ACCESS_KEY: minio
+          MINIO_SECRET_KEY: minio123
diff --git a/README.md b/README.md
index 62316b55..eb3559ef 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,7 @@ Task | Status | License
 [Sparse Document Retriever](tasks/nlp-sparse-document-retriever/) | [![Sparse Document Retriever](https://github.com/platiagro/tasks/workflows/Sparse%20Document%20Retriever/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/nlp-sparse-document-retriever.yml) | TBD
 [Dense Document Retriever](tasks/nlp-dense-document-retriever/) | [![Dense Document Retriever](https://github.com/platiagro/tasks/workflows/Dense%20Document%20Retriever/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/nlp-dense-document-retriever.yml) | TBD
 [Document Reader](tasks/nlp-document-reader/) | [![Document Reader](https://github.com/platiagro/tasks/workflows/Document%20Reader/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/nlp-document-reader.yml) | TBD
+[Question Generator](tasks/nlp-question-generator/) | [![Question Generator](https://github.com/platiagro/tasks/workflows/Question%20Generator/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/nlp-question-generator.yml) | TBD
 [Normalizer](tasks/normalizer/) | [![Normalizer](https://github.com/platiagro/tasks/workflows/Normalizer/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/normalizer.yml) | [BSD 3-Clause](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
 [Pre Selection](tasks/pre-selection/) | [![Pre Selection](https://github.com/platiagro/tasks/workflows/Pre%20Selection/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/pre-selection.yml) | [BSD 3-Clause](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
 [Random Forest Classifier](tasks/random-forest-classifier/) | [![Random Forest Classifier](https://github.com/platiagro/tasks/workflows/Random%20Forest%20Classifier/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/random-forest-classifier.yml) | [BSD 3-Clause](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
diff --git a/tasks/nlp-question-generator/Deployment.ipynb b/tasks/nlp-question-generator/Deployment.ipynb
new file mode 100644
index 00000000..fba77312
--- /dev/null
+++ b/tasks/nlp-question-generator/Deployment.ipynb
@@ -0,0 +1,155 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Question Generator - Implantação\n",
+    "\n",
+    "Utiliza um transformer T5 pré-treinado em português e disponibilizado no hub da [Hugging Face](https://platiagro.github.io/tutorials/).\n",
\n", + "\n", + "### **Em caso de dúvidas, consulte os [tutoriais da PlatIAgro](https://platiagro.github.io/tutorials/).**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Declaração de Classe para Predições em Tempo Real\n", + "\n", + "A tarefa de implantação cria um serviço REST para predições em tempo-real.
\n", + "Para isso você deve criar uma classe `Model` que implementa o método `predict`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting Model.py\n" + ] + } + ], + "source": [ + "%%writefile Model.py\n", + "import joblib\n", + "import pandas as pd\n", + "import numpy as np\n", + "from typing import List\n", + "from expander import DocExpander\n", + "from aux_functions import build_df_result\n", + "\n", + "\n", + "class Model:\n", + " \n", + " def __init__(self):\n", + " self.loaded = False\n", + " \n", + " \n", + " def load(self):\n", + " \n", + " artifacts = joblib.load(\"/tmp/data/qgenerator.joblib\")\n", + " self.model = artifacts[\"model\"]\n", + " self.expand_context = artifacts[\"expand_context\"]\n", + " self.infer_num_gen_sentences = artifacts[\"infer_num_gen_sentences\"]\n", + " self.column_context = artifacts[\"column_context\"]\n", + " self.column_question = artifacts[\"column_question\"]\n", + " self.loaded = True\n", + "\n", + " def class_names(self) -> List:\n", + " return ['doc_id','context','questions','expanded_context']\n", + " \n", + " def expand(self,df):\n", + " if self.expand_context:\n", + " exp = DocExpander() \n", + " df_final = exp.expand_sql(df,context_column_name=self.column_context,questions_column_name = self.column_question)\n", + " \n", + " return df_final\n", + "\n", + " def predict(self, X, feature_names, meta=None):\n", + " \n", + " if not self.loaded:\n", + " self.load()\n", + " \n", + " feature_names_pipeline = ['doc_id', 'context']\n", + " feature_names_qa = ['context']\n", + " \n", + " if feature_names != feature_names_pipeline and feature_names != feature_names_qa:\n", + " raise ValueError(f'feature_names deve ser {feature_names_pipeline} ou {feature_names_qa}')\n", + " \n", + " \n", + " df_input = pd.DataFrame(X,columns=feature_names)\n", + " contexts = df_input['context'].to_numpy()\n", + " gen_questions_dict = self.model.forward(contexts=contexts, num_gen_sentences=self.infer_num_gen_sentences)\n", + " df_result = build_df_result(gen_questions_dict,column_context=self.column_context,column_question=self.column_question)\n", + " df_result = self.expand(df_result)\n", + " \n", + " if feature_names == feature_names_pipeline:\n", + " df_input = df_input[['doc_id']] \n", + " df_input['index'] = df_input.index\n", + " df_result['index'] = df_result.index\n", + " df_result = pd.merge(df_input, df_result, on='index', how='outer')\n", + " del df_result['index']\n", + " \n", + " return df_result.to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# import pandas as pd\n", + "# df = pd.read_csv(\"squad-test-v1.1.csv\")\n", + "# n_lines = 10\n", + "# contexts = df['context'][:n_lines]\n", + "# indexes = df.index[:n_lines]\n", + "\n", + "# df_small = pd.DataFrame({'doc_id':indexes,'context':contexts})\n", + "# X = df_small.to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# from Model import Model\n", + "# model = Model()\n", + "# result = model.predict(X,['doc_id','context'])\n", + "# result" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "experiment_id": "dd63cfbd-7a97-41ac-bd9b-fd11711ba459", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + }, + "operator_id": "e4150bc8-88f2-4d98-b68a-6c246270c403", + "task_id": "ccfeb3fe-3d3a-43cf-bdc4-d0b07017e468" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tasks/nlp-question-generator/Experiment.ipynb b/tasks/nlp-question-generator/Experiment.ipynb new file mode 100644 index 00000000..c38db481 --- /dev/null +++ b/tasks/nlp-question-generator/Experiment.ipynb @@ -0,0 +1,540 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Question Generator - Experimento\n", + "\n", + "Utiliza um transformer T5 pré treinado em português e disponibilizado pelo [huggingfaces](https://platiagro.github.io/tutorials/).
\n", + "### **Em caso de dúvidas, consulte os [tutoriais da PlatIAgro](https://platiagro.github.io/tutorials/).**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Declaração de parâmetros e hiperparâmetros\n", + "\n", + "Declare parâmetros com o botão na barra de ferramentas.
\n", + "A variável `dataset` possui o caminho para leitura do arquivos importados na tarefa de \"Upload de dados\".
\n", + "Você também pode importar arquivos com o botão na barra de ferramentas." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "dataset = \"/tmp/data/reports_contexts_small.csv\" #@param {type:\"string\"}\n", + "\n", + "### Dados para Treinamento\n", + "\n", + "# Dataset\n", + "column_context = \"context\" #@param [\"context\"] {type:\"string\",label:\"Coluna contexto\",description:\"Coluna em que estão contidas os contextos\"}\n", + "column_question = \"questions\" #@param {type:\"string\",label:\"Coluna pergunta\",description:\"Coluna em que estão contidas listas de perguntas por células. Apenas considerada se train_from_zero=True\"}\n", + "train_from_zero = False #@param {type:\"boolean\",label:\"Treinamento do algorítimo do zero\",description:\"Caso True utiliza o algorítimo com finne-tunning no squad em português. Caso True retreina do zero\"}\n", + "train_from_squad = False #@param {type:\"boolean\",label:\"Treinamento do algorítimo pelo Squad\",description:\"Caso True utiliza treinará algorítimo com finne-tunning no squad em português. Caso False teinará com o dataset passado\"}\n", + "expand_context = True #@param {type:\"boolean\",label:\"Expansão de contextos\",description:\"Expande o contexto passado com as perguntas geradas separadas por espaço\"}\n", + "\n", + "#prepare_data\n", + "dev_size_from_data= 0.2 #@param {type:\"float\",label:\"Porcentagem para avaliação\",description:\"Parcela dos dados utilizadas para avaliação, sendo o restante utilizado para treino. Apenas considerada se train_from_zero=True e train_from_squad=True\"}\n", + "test_size_from_dev= 0.5 #@param {type:\"float\",label:\"Porcentagem para teste\",description:\"Parcela dos dados utilizadas para avaliação que serã utilizados para teste, sendo o restante utilizado para validação. Apenas considerada se train_from_zero=True\"}\n", + "#batch_dataset_preparation = 30 #@param {type:\"float\",label:\"Batch para preparação dos dados\",description:\"Tamanho do batchque o tokenizador utilizará para preparar o dataset. Apenas considerada se train_from_zero=True\"}\n", + "\n", + "model_name = \"unicamp-dl/ptt5-base-portuguese-vocab\" #@param {type:\"string\",label:\"Modelo\",description:\"Modelo utilizado da base de modelo do hugginfaces\"}\n", + "PREFIX = \"gerador_perguntas:\" #@param {type:\"string\",label:\"Prefixo para o T5\",description:\"Incluindo em cada sentença passada ao transformers. Apenas considerado se train_from_zero=True\"}\n", + "num_gen_sentences = 2 #@param {type:\"integer\",label:\"Número de perguntas geradas experimentação\",description:\"Apenas consideradao se train_from_zero=True\"}\n", + "infer_num_gen_sentences = 10 #@param {type:\"integer\",label:\"Número de perguntas geradas na inferência\"}\n", + "train_batch_size= 2 #@param {type:\"integer\",label:\"Batch size para treino\",description:\"Tamanho do batch de treino. Está associado a num_gen_sentences. Apenas considerado se train_from_zero=True\"}\n", + "eval_batch_size= 8 #@param {type:\"integer\",label:\"Batch size para avaliação\",description:\"Tamanho do batch de validação e teste. Está associado a num_gen_sentences. 
Apenas considerado se train_from_zero=True\"}\n", + "infer_batch_size = 8 #@param {type:\"integer\",label:\"Batch size para inferência\"}\n", + "no_repeat_ngram_size= 2 #@param {type:\"float\",label:\"Sequência máxima de tokens repetidos\",description:\"Após a repetição de tokens configurada, força a trocar de token na geração do decoder\"}\n", + "temperature= 0.7 #@param {type:\"float\",label:\"Temperatura de randomização do decoder\",description:\"Pode ser entre 0 e 1. Quanto mais próxima de 0, mais próximo da decodificação gulosa (que procura tokens de maior probabilidade), quanto mais próximo de 1 randomiza entre os tokens contidos no top_p\"}\n", + "top_p= 0.92 #@param {type:\"float\",label:\"Porcentagem de consideração\",description:\"Considera apenas os tokens que compoẽ a porcentagem top_op no histograma de probabilidades dos tokens de saída https://huggingface.co/blog/how-to-generate\"}\n", + "source_max_length= 512 #@param {type:\"integer\",label:\"Tamanho do contexto de entrada\",description:\"Tamanho máximo contexto de entrada em tokens\"}\n", + "target_max_length= 100 #@param {type:\"integer\",label:\"Tamanho da sentença gerada\",description:\"Tamanho máximo da pergunta derada em tokens\"}\n", + "learning_rate= 3.0e-5 #@param {type:\"float\",label:\"Taxa de aprendizado\"}\n", + "eps= 1.0e-08 #@param {type:\"integer\",float:\"Valor de estabilidade do otimizador Adam\"}\n", + "seed = 13 #@param {type:\"integer\",label:\"Semente de aleatoriedade\"}\n", + "\n", + "#lightning_params\n", + "num_gpus= 1 #@param {type:\"integer\",label:\"Numero de GPUs\"}\n", + "profiler= True #@param {type:\"integer\",label:\"Resumo\"}\n", + "max_epochs= 1 #@param {type:\"integer\",label:\"Máximo de épocas para treinamento\"}\n", + "accumulate_grad_batches= 16 #@param {type:\"integer\",label:\"Batchs acumulados\",description:\"Batchs acumulados antes de atualizar os pesos\"}\n", + "check_val_every_n_epoch= 1 #@param {type:\"integer\",label:\"Frequência Validação\",description:\"Frequência da chamada da validação em épocas\"}\n", + "progress_bar_refresh_rate= 1 #@param {type:\"integer\",label:\"Frequência de autuazaliação da barra de progresso\"}\n", + "gradient_clip_val= 1.0 #@param {type:\"float\",label:\"Favor de corte dos gradientes\",\"description\":\"O fator evita que os gradientes explodam definindo um limite para os mesmos\"}\n", + "fast_dev_run= False #@param {type:\"boolean\",label:\"Rodar um batch\",description:\"Utilizado para validar que todas as partes estão funcionando antes de treinar o modelo por inteiro\"}\n", + "\n", + "#early_stop_callback\n", + "monitor= 'avg_train_loss' #@param {type:\"integer\",label:\"Frequência de autuazaliação da barra de progresso\"}\n", + "min_delta= 0.01 #@param {type:\"integer\",label:\"Variação mínima entre épocas\"}\n", + "patience= 1 #@param {type:\"integer\",label:\"Epera após atingir variação mínima\"}\n", + "verbose= False #@param {type:\"boolean\",label:\"Disponibilizar informações early stop\"}\n", + "mode= 'min' #@param [\"min\",\"max\"] {type:\"integer\",label:\"Modo de parada\",description: \"Modo de funcionamento para critério de parada\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Leitura do conjunto de dados\n", + "\n", + "O exemplo abaixo faz a leitura de dados tabulares (ex: .csv).
\n", + "Modifique o código de acordo com o tipo de dado que desejar ler." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "if \"context\" not in df.columns:\n", + " raise ValueError(\"A coluna context deve estar obrigatoramente contida no dataset\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download de arquivos necessários" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "GLOVE_WEIGHTS_PATH = \"glove_s300_portugues.txt\"\n", + "SQUAD_TRAIN_PATH = \"squad-train-v1.1.json\"\n", + "SQUAD_DEV_PATH = \"squad-dev-v1.1.json\"\n", + "if not os.path.exists(GLOVE_WEIGHTS_PATH):\n", + " !wget -nc https://storage.googleapis.com/platiagro/Vident/glove_s300_portugues.txt\n", + "if not os.path.exists(SQUAD_TRAIN_PATH):\n", + " !wget -nc https://storage.googleapis.com/platiagro/Vident/squad-train-v1.1.json\n", + "if not os.path.exists(SQUAD_DEV_PATH):\n", + " !wget -nc https://storage.googleapis.com/platiagro/Vident/squad-dev-v1.1.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Divide dataset em subconjuntos de treino, validação e teste\n", + "\n", + "Subconjunto de treino: amostra de dados usada para treinar o modelo.
\n", + "Subconjunto de treino: amostra de dados usada para validar o modelo.
\n", + "Subconjunto de teste: amostra de dados usada para fornecer uma avaliação imparcial do treinamento do modelo no subconjunto de dados de treino." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + " \n", + "if (train_from_zero and not train_from_squad):\n", + " df_train,df_ = train_test_split(df, train_size=dev_size_from_data)\n", + " df_valid, df_test = train_test_split(df_, train_size=test_size_from_dev)\n", + " train_output = 'df_squad_train_bert_chuncked.csv'\n", + " valid_output = 'df_squad_valid_bert_chuncked.csv'\n", + " test_output = 'df_squad_test_bert_chuncked.csv'\n", + " df_train.to_csv(os.path.join(train_output),index=False)\n", + " df_valid.to_csv(os.path.join(valid_output),index=False)\n", + " df_test.to_csv(os.path.join(test_output),index=False)\n", + "else:\n", + " df_test = df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configurando Argumentos" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Mapeando dirpaths\n", + "data_dir = root_dir = os.getcwd()\n", + "logs_dir = os.path.join(root_dir)\n", + "# Colocando parâmetros de entrada no fromato esperado\n", + "hparams = {\n", + " \"model_name\":model_name,\n", + " \"PREFIX\":PREFIX,\n", + " \"num_gen_sentences\":num_gen_sentences,\n", + " \"infer_num_gen_sentences\":infer_num_gen_sentences,\n", + " \"no_repeat_ngram_size\":no_repeat_ngram_size,\n", + " \"temperature\":temperature,\n", + " \"top_p\":top_p,\n", + " \"train_batch_size\":train_batch_size,\n", + " \"eval_batch_size\":eval_batch_size,\n", + " \"infer_batch_size\":infer_batch_size,\n", + " \"source_max_length\":source_max_length,\n", + " \"target_max_length\":target_max_length,\n", + " \"learning_rate\":learning_rate,\n", + " \"eps\":eps,\n", + " \"seed\":seed,\n", + "}\n", + "\n", + "lightning_params = {\n", + " \"num_gpus\":num_gpus,\n", + " \"profiler\":profiler,\n", + " \"max_epochs\":max_epochs,\n", + " \"accumulate_grad_batches\":accumulate_grad_batches,\n", + " \"check_val_every_n_epoch\":check_val_every_n_epoch,\n", + " \"progress_bar_refresh_rate\":progress_bar_refresh_rate,\n", + " \"gradient_clip_val\":gradient_clip_val,\n", + " \"fast_dev_run\":fast_dev_run,\n", + "}\n", + "\n", + "early_stop_callback_params = {\n", + " \"monitor\":monitor,\n", + " \"min_delta\":min_delta,\n", + " \"patience\":patience,\n", + " \"verbose\":verbose,\n", + " \"mode\":mode, \n", + "}\n", + "\n", + "prepare_data_params = {\n", + " #\"batch_dataset_preparation\":batch_dataset_preparation,\n", + " \"test_size_from_dev\":test_size_from_dev,\n", + "}\n", + "\n", + "# Configurações\n", + "config = {'params':{'hparams':hparams,\n", + " 'lightning_params':lightning_params,\n", + " 'early_stop_callback_params':early_stop_callback_params,\n", + " 'prepare_data_params':prepare_data_params },\n", + "\n", + " 'dirpaths':{'data_dirpath':data_dir,\n", + " 'log_dirpath':logs_dir,\n", + " 'cwd_dirpath':root_dir},\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Treinamento do Zero / Recuperação dos pesos" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n" + ] + }, + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "08ac3e8118c54ec7a0613790deb89025", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/756k [00:00" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "from platiagro.plotting import plot_data_table\n", + "ax = plot_data_table(df_final)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "df_final.to_csv(dataset, index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Salva resultados da tarefa\n", + "\n", + "A plataforma guarda o conteúdo de `/tmp/data/` para as tarefas subsequentes.
\n", + "Use essa pasta para salvar modelos, metadados e outros resultados." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['/tmp/data/qgenerator.joblib']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from joblib import dump\n", + "\n", + "artifacts = {\n", + " \"model\":qgenerator_caller,\n", + " \"expand_context\":expand_context,\n", + " \"infer_num_gen_sentences\":infer_num_gen_sentences,\n", + " \"column_context\":column_context,\n", + " \"column_question\":column_question\n", + "} \n", + "\n", + "dump(artifacts, \"/tmp/data/qgenerator.joblib\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Liberando Memória da GPU" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "qgenerator_caller.free_memory()\n", + "del qgenerator_caller\n", + "torch.cuda.empty_cache() " + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "experiment_id": "dd63cfbd-7a97-41ac-bd9b-fd11711ba459", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + }, + "operator_id": "e4150bc8-88f2-4d98-b68a-6c246270c403", + "task_id": "ccfeb3fe-3d3a-43cf-bdc4-d0b07017e468" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tasks/nlp-question-generator/aux_functions.py b/tasks/nlp-question-generator/aux_functions.py new file mode 100644 index 00000000..be5fbf73 --- /dev/null +++ b/tasks/nlp-question-generator/aux_functions.py @@ -0,0 +1,7 @@ +import pandas as pd + +def build_df_result(gen_questions_dict,column_context="context",column_question="questions"): + context_list = [v["context"] for k,v in gen_questions_dict.items()] + questions_list = [v["questions"] for k,v in gen_questions_dict.items()] + df_result = pd.DataFrame({column_context: context_list,column_question: questions_list}) + return df_result \ No newline at end of file diff --git a/tasks/nlp-question-generator/caller.py b/tasks/nlp-question-generator/caller.py new file mode 100644 index 00000000..8b4fb796 --- /dev/null +++ b/tasks/nlp-question-generator/caller.py @@ -0,0 +1,407 @@ +import os +import gc +import sys +import yaml +import torch +import itertools +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from sklearn.model_selection import train_test_split +from tqdm import tqdm +from transformers import T5Tokenizer +from multiprocessing import cpu_count +from torch.utils.data import DataLoader +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping,GPUStatsMonitor + +# Classes and functions from the project +from dataset import CustomDataset +from model import T5Finetuner +from io_utils import IO_Utils + + +#TODO: Precisa Fazer +class Qgenerator_caller(): + """Modelo Abstrato Herdado por todos os outros modelos""" + def __init__(self, cfg): + #self.config = Config.from_json(cfg) + self.config = cfg + self.io_utils = IO_Utils() + self.MODEL = None + + # Checagem da ordem das chamadas + self.build_called = False + self.train_called = False + self.load_called = False + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + def 
build(self,**kwargs):
+        """
+        Responsável por criar os argumentos da classe
+        """
+        # Checagem das Chamadas
+        self.build_called = True
+
+        # Recuperando Caminhos
+        self.data_dirpath = self.config['dirpaths']['data_dirpath']
+        self.log_dirpath = self.config['dirpaths']['log_dirpath']
+        self.cwd_dirpath = self.config['dirpaths']['cwd_dirpath']
+
+        # Recuperando Parâmetros
+        self.hparams = self.config['params']['hparams']
+        self.lightning_params = self.config['params']['lightning_params']
+        self.early_stop_callback_params = self.config['params']['early_stop_callback_params']
+        self.prepare_data_params = self.config['params']['prepare_data_params']
+        #-
+        self.test_size_from_dev = self.prepare_data_params['test_size_from_dev']
+        #-
+        self.model_name = self.hparams['model_name']
+        self.num_gen_sentences = self.hparams['num_gen_sentences']
+        self.no_repeat_ngram_size = self.hparams['no_repeat_ngram_size']
+        self.train_batch_size = self.hparams['train_batch_size']
+        self.eval_batch_size = self.hparams['eval_batch_size']
+        self.source_max_length = self.hparams['source_max_length']
+        self.target_max_length = self.hparams['target_max_length']
+        self.temperature = self.hparams['temperature']
+        self.top_p = self.hparams['top_p']
+        self.learning_rate = self.hparams['learning_rate']
+        self.eps = self.hparams['eps']
+        self.seed = self.hparams['seed']
+        #-
+        self.num_gpus = self.lightning_params['num_gpus'] if torch.cuda.is_available() else 0
+        self.profiler = self.lightning_params['profiler']
+        self.max_epochs = self.lightning_params['max_epochs']
+        self.accumulate_grad_batches = self.lightning_params['accumulate_grad_batches']
+        self.check_val_every_n_epoch = self.lightning_params['check_val_every_n_epoch']
+        self.progress_bar_refresh_rate = self.lightning_params['progress_bar_refresh_rate']
+        self.gradient_clip_val = self.lightning_params['gradient_clip_val']
+        self.fast_dev_run = self.lightning_params['fast_dev_run']
+        #-
+        self.monitor = self.early_stop_callback_params['monitor']
+        self.min_delta = self.early_stop_callback_params['min_delta']
+        self.patience = self.early_stop_callback_params['patience']
+        self.verbose = self.early_stop_callback_params['verbose']
+        self.mode = self.early_stop_callback_params['mode']
+
+        # Criando parâmetros adicionais
+        self.tokenizer = T5Tokenizer.from_pretrained(self.config['params']['hparams']['model_name'])
+        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+        self.MODEL = None
+
+        # Trainer
+        if self.fast_dev_run:
+            self.TRAINER = pl.Trainer(
+                gpus=self.num_gpus,
+                checkpoint_callback=False,  # Disable checkpoint saving.
+                fast_dev_run=True
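+                # fast_dev_run executa um único batch de treino e validação como
+                # teste rápido do pipeline, sem salvar checkpoints nem logs completos.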
+            )
+        else:
+
+            checkpoint_callback = ModelCheckpoint(
+                dirpath=self.data_dirpath, save_top_k=-1
+            )
+
+            early_stop_callback = EarlyStopping(
+                monitor=self.early_stop_callback_params['monitor'],
+                min_delta=self.early_stop_callback_params['min_delta'],
+                patience=self.early_stop_callback_params['patience'],
+                verbose=self.early_stop_callback_params['verbose'],
+                mode=self.early_stop_callback_params['mode']
+            )
+
+            gpu_stats = GPUStatsMonitor()
+            tb_logger = pl.loggers.TensorBoardLogger(f"{self.log_dirpath}")
+
+            self.TRAINER = pl.Trainer(
+                gpus= self.lightning_params['num_gpus'],
+                profiler=self.lightning_params['profiler'],
+                max_epochs=self.lightning_params['max_epochs'],
+                accumulate_grad_batches = self.lightning_params['accumulate_grad_batches'],
+                check_val_every_n_epoch=self.lightning_params['check_val_every_n_epoch'],
+                progress_bar_refresh_rate=self.lightning_params['progress_bar_refresh_rate'],
+                callbacks = [early_stop_callback,gpu_stats,checkpoint_callback],
+                resume_from_checkpoint=None,
+                logger = tb_logger
+            )
+
+    def free_memory(self):
+        del self.MODEL
+        del self.TRAINER
+        del self.tokenizer
+        del self.device
+        del self.hparams
+        del self.config
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def load_model(self,**kwargs):
+        def verify_args(checkpoint_path):
+            if not checkpoint_path:
+                raise ValueError("checkpoint_path é um argumento obrigatório")
+
+        # Checagem das Chamadas
+        if not (self.build_called):
+            raise AssertionError("Para chamar o método load é necessário chamar o método build")
+        self.load_called = True
+
+        # Recuperando variáveis kwargs
+        checkpoint_path = kwargs.get('checkpoint_path',None)
+        verify_args(checkpoint_path)
+
+        # Atualizando parâmetros de treinamento
+        hparams = self.hparams.copy()
+        hparams['device'] = self.device
+        hparams['track_metrics'] = False
+
+        # Carregando o modelo
+        self.MODEL = T5Finetuner.load_from_checkpoint(checkpoint_path = checkpoint_path,map_location=self.device,hparams=hparams)
+        self.MODEL.to(self.device)
+
+    def train(self,**kwargs):
+        def verify_args(train_path,valid_path,test_path,glove_weights_path):
+            if not train_path:
+                raise ValueError("train_path é um argumento obrigatório")
+            if not valid_path:
+                raise ValueError("valid_path é um argumento obrigatório")
+            if not test_path:
+                raise ValueError("test_path é um argumento obrigatório")
+            if not glove_weights_path:
+                raise ValueError("glove_weights_path é um argumento obrigatório")
+
+        # Checagem das Chamadas
+        if not (self.build_called):
+            raise AssertionError("Para chamar o método train é necessário chamar o método build")
+        self.train_called = True
+
+        # Recuperando variáveis kwargs
+        MODEL_PATH = kwargs.get('MODEL_PATH',None)
+        train_path = kwargs.get('train_path',None)
+        valid_path = kwargs.get('valid_path',None)
+        test_path = kwargs.get('test_path',None)
+        glove_weights_path = kwargs.get('glove_weights_path',None)
+        verify_args(train_path,valid_path,test_path,glove_weights_path)
+
+        # Criando datasets
+        df_result_train = self.io_utils.read_csv_to_df(filepath=os.path.join(self.data_dirpath,train_path))
+        df_result_valid = self.io_utils.read_csv_to_df(filepath=os.path.join(self.data_dirpath,valid_path))
+        df_result_test = self.io_utils.read_csv_to_df(filepath=os.path.join(self.data_dirpath,test_path))
+
+        X_train, y_train = np.array(df_result_train['context']), np.array(df_result_train['question'])
+        X_valid, y_valid = np.array(df_result_valid['context']), np.array(df_result_valid['question'])
+        X_test, y_test = np.array(df_result_test['context']), np.array(df_result_test['question'])
+
+        train_dataset = CustomDataset(PREFIX=self.hparams['PREFIX'],
+                                      tokenizer=self.tokenizer,
+                                      X_context=X_train,
+                                      y_question=y_train,
+                                      source_max_length=self.hparams['source_max_length'],
+                                      target_max_length=self.hparams['target_max_length'],
+                                      step='Experiment',
+                                      )
+
+        valid_dataset = CustomDataset(PREFIX=self.hparams['PREFIX'],
+                                      tokenizer=self.tokenizer,
+                                      X_context=X_valid,
+                                      y_question=y_valid,
+                                      source_max_length=self.hparams['source_max_length'],
+                                      target_max_length=self.hparams['target_max_length'],
+                                      step='Experiment',
+                                      )
+
+        test_dataset = CustomDataset(PREFIX=self.hparams['PREFIX'],
+                                     tokenizer=self.tokenizer,
+                                     X_context=X_test,
+                                     y_question=y_test,
+                                     source_max_length=self.hparams['source_max_length'],
+                                     target_max_length=self.hparams['target_max_length'],
+                                     step='Experiment',
+                                     )
+
+        # Atualizando parâmetros de treinamento
+        hparams = self.hparams.copy()
+        hparams['tokenizer'] = self.tokenizer
+        hparams['device'] = self.device
+        hparams['glove_weights_path'] = glove_weights_path
+        hparams['overfit'] = False
+        hparams['track_metrics'] = True
+        hparams['train_dataset'] = train_dataset
+        hparams['valid_dataset'] = valid_dataset
+        hparams['test_dataset'] = test_dataset
+
+        # Checando se o treinamento será feito do zero ou a partir de um treinamento interrompido
+        if MODEL_PATH:
+            self.MODEL = T5Finetuner.load_from_checkpoint(
+                checkpoint_path = MODEL_PATH,
+                map_location=self.device,
+                hparams=hparams
+            )
+        else:
+            self.MODEL = T5Finetuner(
+                hparams=hparams
+            )
+
+        # Treinando Algoritmos
+        self.TRAINER.fit(self.MODEL)
+
+    def save_checkpoint(self,checkpoint_path):
+        # Checagem das Chamadas
+        if not (self.train_called):
+            raise AssertionError("Para chamar o método save_checkpoint é necessário chamar o método train")
+
+        self.TRAINER.save_checkpoint(checkpoint_path)
+
+    def evaluate(self,**kwargs):
+        # Checagem das Chamadas
+        if not (self.build_called and (self.train_called or self.load_called)):
+            raise AssertionError("Para chamar o método evaluate é necessário chamar o método build e em seguida o método train ou o método load")
+
+        # Testando
+        self.TRAINER.test(self.MODEL)
+
+        # Salvando os resultados
+        valid_results_output_path = os.path.join(self.log_dirpath,'valid_results.json')
+        test_results_output_path = os.path.join(self.log_dirpath,'test_results.json')
+        valid_results = self.MODEL.valid_metrics_calculator.list_dict_track
+        test_results = self.MODEL.test_metrics_calculator.list_dict_track
+        self.io_utils.dump_json(filepath=valid_results_output_path,d=valid_results)
+        self.io_utils.dump_json(filepath=test_results_output_path,d=test_results)
+
+        return {'valid_results':valid_results,
+                'test_results':test_results
+                }
+
+    def forward(self,**kwargs):
+
+        def verify_args(contexts,num_gen_sentences):
+
+            if not all(isinstance(elem, str) for elem in contexts):
+                raise ValueError(f"contexts deve ser uma lista de strings mas é {contexts}")
+            if not num_gen_sentences:
+                raise ValueError("num_gen_sentences é um argumento obrigatório")
+
+        # Checagem das Chamadas
+        if not (self.build_called and (self.train_called or self.load_called)):
+            raise AssertionError("Para chamar o método forward é necessário chamar o método build e em seguida o método train ou o método load")
+
+        # Recuperando variáveis kwargs
+        num_gen_sentences = kwargs.get('num_gen_sentences',None)
+        contexts = kwargs.get('contexts',None)
+        verify_args(contexts,num_gen_sentences)
+
+        X_test = np.array(contexts)
+
+        inference_dataset = CustomDataset(PREFIX=self.hparams['PREFIX'],
+                                          tokenizer=self.tokenizer,
+                                          X_context=X_test,
+                                          y_question=[],
+                                          source_max_length=self.hparams['source_max_length'],
+                                          target_max_length=self.hparams['target_max_length'],
+                                          step='Deployment',
+                                          )
+
+        with torch.no_grad():
+
+            self.MODEL.eval()
+            self.MODEL.to(self.device)
+            # Usa o batch size de inferência declarado nos hiperparâmetros
+            inference_dataloader = DataLoader(inference_dataset, batch_size=self.hparams['infer_batch_size'], shuffle=False,num_workers=cpu_count())
+
+            result = {}
+            j = 0
+            for i,batch in enumerate(tqdm(inference_dataloader)):
+                source_token_ids, source_masks, original_source = batch
+                batch_size = len(original_source)
+                source_token_ids = source_token_ids.to(self.device)
+                source_masks = source_masks.to(self.device)
+                logits = self.MODEL.forward(source_token_ids, source_masks,info_requested='logits',num_gen_sentences=num_gen_sentences)
+                gen_questions = [self.tokenizer.decode(l, skip_special_tokens=True) for l in logits]
+                questions_per_context = [gen_questions[s:s+num_gen_sentences] for s in list(range(0,len(gen_questions),num_gen_sentences))]
+                result_batch = {f'{j+k}':{'context':original_source[k],'questions':questions_per_context[k]} for k in range(batch_size)}
+                result.update(result_batch)
+                j += batch_size
+
+        return result
+
+    def prepare_data(self,**kwargs):
+        def verify_args(squad_train_path,squad_dev_path):
+            if not squad_train_path:
+                raise ValueError("squad_train_path é um argumento obrigatório")
+            if not squad_dev_path:
+                raise ValueError("squad_dev_path é um argumento obrigatório")
+
+        # Recuperando variáveis kwargs
+        squad_train_path = kwargs.get('squad_train_path',None)
+        squad_dev_path = kwargs.get('squad_dev_path',None)
+        verify_args(squad_train_path,squad_dev_path)
+
+        # Convertendo Json em Dataframe
+        train_json = self.io_utils.read_json(os.path.join(self.data_dirpath,squad_train_path))
+        dev_json = self.io_utils.read_json(os.path.join(self.data_dirpath,squad_dev_path))
+        df_train = self._read_squad_json_as_dataframe(train_json)
+        df_dev = self._read_squad_json_as_dataframe(dev_json)
+        df_valid, df_test = train_test_split(df_dev, test_size=self.test_size_from_dev)
+
+        # Chunk dataset
+        df_result_train = df_train #self._convert_tokenized_examples_to_dataset(df=df_train)
+        df_result_valid = df_valid #self._convert_tokenized_examples_to_dataset(df=df_valid)
+        df_result_test = df_test #self._convert_tokenized_examples_to_dataset(df=df_test)
+        #df_result = pd.concat([df_result_train, df_result_valid, df_result_test],ignore_index=True)
+
+        # Salvando dados
+        train_output = os.path.join(self.data_dirpath,'squad-train-v1.1.csv')
+        valid_output = os.path.join(self.data_dirpath,'squad-valid-v1.1.csv')
+        test_output = os.path.join(self.data_dirpath,'squad-test-v1.1.csv')
+        #complete_output = os.path.join(self.data_dirpath,'squad-v1.1.csv')
+
+        df_result_train.to_csv(os.path.join(train_output),index=False)
+        df_result_valid.to_csv(os.path.join(valid_output),index=False)
+        df_result_test.to_csv(os.path.join(test_output),index=False)
+        #df_result.to_csv(os.path.join(complete_output),index=False)
+
+        return {
+            'prepared_data_train_path':train_output,
+            'prepared_data_valid_path':valid_output,
+            'prepared_data_test_path':test_output,
+        }
+
+    def _apply_preprocessing(self,text):
+        text = " ".join(text.split()).strip()
+        return text
+
+    def _read_squad_json_as_dataframe(self,json_file):
+
+        context, question, answer, answer_start = [], [], [], []
+
+        for d in json_file['data']:
+
+            for c in d['paragraphs']:
+
+                for q in 
c['qas']: + + for a in q['answers']: + + context.append(self._apply_preprocessing(c['context'])) + question.append(self._apply_preprocessing(q['question'])) + answer.append(self._apply_preprocessing(a['text'])) + answer_start.append(a['answer_start']) + + df = pd.DataFrame({'context': context, 'question': question, 'answer': answer, 'answer_start': answer_start}) + + return df + + def build_complete_json(self,gen_questions_dict,reports_contents): + + gen_questions_dict_copy = gen_questions_dict.copy() + for k, v in gen_questions_dict.items(): + rp_infos = reports_contents[k] + report_name = rp_infos['report_name'] + section_name = rp_infos['section_name'] + gen_questions_dict_copy[k]['report_name'] = report_name + gen_questions_dict_copy[k]['section_name'] = section_name + + return gen_questions_dict_copy diff --git a/tasks/nlp-question-generator/dataset.py b/tasks/nlp-question-generator/dataset.py new file mode 100644 index 00000000..2d05ef31 --- /dev/null +++ b/tasks/nlp-question-generator/dataset.py @@ -0,0 +1,63 @@ +import torch +import numpy as np +from typing import List, Union, Optional +from torch.utils.data import Dataset + +class CustomDataset(Dataset): + def __init__(self,PREFIX,tokenizer,X_context:np.ndarray,y_question:Optional[np.ndarray]=[], + source_max_length: int = 32, target_max_length: int = 32,step="Experiment"): + self.tokenizer = tokenizer + self.X_context = X_context + + self.y_question = y_question + self.source_max_length = min(source_max_length + len(PREFIX.split(' ')),512) + self.target_max_length = target_max_length + self.step = step + self.PREFIX = PREFIX + + + if step == "Experiment" and len(y_question)==0: + raise Exception("Na fase de experimento o componente tem de haver um y de referência") + + + if step == "Deployment" and len(y_question)>0: + raise Exception("Na fase de implantação o componente tem deve possuir y=None") + + def __len__(self): + return len(self.X_context) + + def __getitem__(self, idx): + #Source + original_source = self.X_context[idx] + source = f"{self.PREFIX} {original_source}" + source_encoder = self.encoder_plus(source,self.source_max_length) + source_token_ids = source_encoder['input_ids'] + source_mask = source_encoder['attention_mask'] + source_token_ids = torch.tensor(source_token_ids).type(torch.long) + source_mask = torch.tensor(source_mask).type(torch.long) + + if self.step=="Experiment": + # Target + original_target = self.y_question[idx] + target = f"{original_target}" + target_encoder = self.encoder_plus(target,self.target_max_length) + target_token_ids = target_encoder['input_ids'] + target_mask = target_encoder['attention_mask'] + target_token_ids = torch.tensor(target_token_ids).type(torch.long) + target_mask = torch.tensor(target_mask).type(torch.long) + + retorno = (source_token_ids, source_mask, target_token_ids, target_mask, original_source, original_target) + + if self.step=="Deployment": + retorno = (source_token_ids, source_mask, original_source) + + return retorno + + def encoder_plus(self,text,L): + #padding - max_length:de acordo com o atributo max_length - True: maior sentença no batch + #é preciso avaliar a performance disso. 
O True me parece melhor e o max_lenth me parece + # com maiores chances de funcionar + return self.tokenizer.encode_plus(text, + max_length = L, + truncation=True, + padding="max_length") \ No newline at end of file diff --git a/tasks/nlp-question-generator/expander.py b/tasks/nlp-question-generator/expander.py new file mode 100644 index 00000000..205d5bfc --- /dev/null +++ b/tasks/nlp-question-generator/expander.py @@ -0,0 +1,57 @@ +import os +import pandas as pd + +from io_utils import IO_Utils +from typing import List + +class DocExpander: + def __init__(self): + ''' + Expand documents with the questions gerated from them. + Documents and questions and questins between them are separated by the special token [SEP] + ''' + pass + + def expand_nosql(self,context_questions_map,context_key='context',questions_key = 'questions',apply_filter=False,apply_low_case=False): + """ + Retorna os contextos expandidos no formato noSQL com o id do contexto, tendo como valores + os contextos, contextos expandidos e perguntas. + """ + + context_questions_map_internal = context_questions_map.copy() + + if apply_low_case: + context_questions_map = self.lower_case_dict(context_questions_map) + + if apply_filter: + context_questions_map_internal = self.filter_post_content(content=context_questions_map_internal, + section_names_to_keep=['Capítulo 6', 'Capitulo 6'], + min_context_length_in_tokens=20) + + for k,v in context_questions_map_internal.items(): + context = v[context_key] + questions = v[questions_key] + expanded_context = context + ' ' + ' '.join(questions) + expanded_context = expanded_context.strip() + context_questions_map_internal[k]['expanded_context'] = expanded_context + + return context_questions_map_internal + + # TODO: Desenvolver técnica SQL + def expand_sql(self,df,context_column_name='context',questions_column_name = 'questions'): + """ + Retorna os contextos expandidos no formato de uma nova coluna no dataframe + """ + + df_copy = df.copy() + expanded_context_list = [] + for index, row in df_copy.iterrows(): + questions = row[questions_column_name] + context = row[context_column_name] + expanded_context = context + ' ' + ' '.join(questions) + expanded_context = expanded_context.strip() + expanded_context_list.append(expanded_context) + + df_copy.insert(df.shape[1], "expanded_context", expanded_context_list) + + return df_copy \ No newline at end of file diff --git a/tasks/nlp-question-generator/io_utils.py b/tasks/nlp-question-generator/io_utils.py new file mode 100644 index 00000000..7047bb9d --- /dev/null +++ b/tasks/nlp-question-generator/io_utils.py @@ -0,0 +1,75 @@ +import os +import json +import pandas as pd +from typing import List +from pickle import load as read_pickle +from pickle import dump as dump_pickle + +class IO_Utils(object): + """ + Class with utilities for reading and writing + """ + def __init__(self): + pass + + def read_json(self,filepath:str): + with open(filepath) as f: + json_result = json.load(f) + return json_result + + def reads_json(self,filepath:str): + with open(filepath) as f: + json_result = json.loads(f.read()) + json_result = json.loads(json_result) + return json_result + + def dump_json(self,filepath:str,d,ensure_ascii=False,command='a'): + with open(filepath, command) as fp: + json.dump(d, fp, ensure_ascii=ensure_ascii) + + def dumps_json(self,filepath:str,d,ensure_ascii=False,command='a'): + with open(filepath, command) as fp: + d = json.dumps(d, ensure_ascii=ensure_ascii) + json.dump(d, fp, ensure_ascii=ensure_ascii) + + def 
read_pickle(self,filepath:str): + with open(filepath, 'rb') as f: + content = read_pickle(f) + return content + + def save_pickle(self,filepath:str,info): + """ + Save info in a picke file + """ + with open(filepath, 'wb') as f: + dump_pickle(info, f) + + def create_folder_structure(self,folder:str): + """ Create the comple folder structure if it does not exists """ + if not os.path.exists(folder): + os.makedirs(folder) + + def read_line_spaced_txt_file(self,filepath:str): + with open(filepath, 'r') as infile: + data = infile.read().splitlines() + return data + + def save_line_spaced_txt_file(self,filepath:str,text_list:List[str]): + with open(filepath, "w") as output: + for row in text_list: + output.write(str(row) + '\n') + + def save_df_to_csv(self,filepath:str,df:pd.DataFrame,zipped=False): + if filepath.split(".")[-1] != "csv": + raise ValueError(f"{filepath} tem de ter a extensão .csv") + + filepath = filepath.split(".csv")[0]+".csv.gz" if zipped else filepath + compression = 'gzip' if zipped else 'infer' + df.to_csv(filepath, compression=compression,index=False) + + + def read_csv_to_df(self,filepath:str): + if ".csv" not in filepath and ".csv.gz" not in filepath: + raise ValueError(f"{filepath} tem de ter a extensão .csv ou csv.gz") + df = pd.read_csv(filepath) + return df \ No newline at end of file diff --git a/tasks/nlp-question-generator/metrics_calculator.py b/tasks/nlp-question-generator/metrics_calculator.py new file mode 100644 index 00000000..ac913281 --- /dev/null +++ b/tasks/nlp-question-generator/metrics_calculator.py @@ -0,0 +1,232 @@ +import numpy as np +#from nlgeval import NLGEval +from gensim.models import KeyedVectors +import torch +import numpy as np +import nltk +nltk.download('stopwords') + +class Metrics_Calculator(object): + + def __init__(self,hparams,glove_comparer): + + + super(Metrics_Calculator, self).__init__() + #self.nlg_eval = NLGEval(metrics_to_omit=['EmbeddingAverageCosineSimilairty', 'EmbeddingAverageCosineSimilarity','GreedyMatchingScore','SkipThoughtCS','VectorExtremaCosineSimilarity']) + self.list_dict_track = {"data":[]} + self.hparams = hparams + self.glove_comparer = glove_comparer + + + def build_json_results(self, + context, + generated_question_list, + target_question_list, + row_mean_metrics): + + """ + Cria json para cada linha que será salvo para monitorar as métricas em self.list_dict_track + """ + new_info = {} + new_info["context"] =context + new_info["generated_question_list"] =generated_question_list + new_info["target_question_list"] =target_question_list + new_info["row_mean_metrics"] =row_mean_metrics + + + return new_info + + def track_metrics_row(self,original_target,gen_target_options_list): + """ + Calcula as métricas para cada par question-context + """ + # bleu_1_list = [] + # bleu_2_list = [] + # bleu_3_list = [] + # bleu_4_list = [] + # CIDEr_list = [] + # ROUGE_L_list = [] + cossine_similarity_list = [] + + for gen_target_option in gen_target_options_list: + + # metrics_dict = self.nlg_eval.compute_individual_metrics(ref=[original_target],hyp=gen_target_option)#ref:List[str] , hyp:str + # bleu_1_list.append(metrics_dict['Bleu_1']) + # bleu_2_list.append(metrics_dict['Bleu_2']) + # bleu_3_list.append(metrics_dict['Bleu_3']) + # bleu_4_list.append(metrics_dict['Bleu_4']) + # CIDEr_list.append(metrics_dict['CIDEr']) + # ROUGE_L_list.append(metrics_dict['ROUGE_L']) + + cs = self.glove_comparer.compare_sentences_with_cossine_similarity(original_target,gen_target_option) + cossine_similarity_list.append(cs) + + + + # 
row_metrics_dict = {"Bleu_1":np.mean(bleu_1_list), + # "Bleu_2":np.mean(bleu_2_list), + # "Bleu_3":np.mean(bleu_3_list), + # "Bleu_4":np.mean(bleu_4_list), + # "CIDEr":np.mean(CIDEr_list), + # "ROUGE_L":np.mean(ROUGE_L_list), + row_metrics_dict = { + "Glove_Cossine_Similarity":np.mean(cossine_similarity_list)} + + return row_metrics_dict + + + + def generate_sentences_and_track_metrics_batch(self,logits,original_targets_batch,original_sources_batch,save_track_dict=False): + """ + Calcula métricas para todo o batch + """ + # batch_bleu_1_list = [] + # batch_bleu_2_list = [] + # batch_bleu_3_list = [] + # batch_bleu_4_list = [] + # batch_CIDEr_list = [] + # batch_ROUGE_L_list = [] + batch_Glove_Cossine_Similarity_list = [] + + + #batch + for i,(original_target,original_source) in enumerate(zip(original_targets_batch,original_sources_batch)): + #linha + relevant_logits = logits[i*self.hparams.num_gen_sentences:self.hparams.num_gen_sentences+i*self.hparams.num_gen_sentences] + gen_target_options_list = [self.hparams.tokenizer.decode(l, skip_special_tokens=True) for l in relevant_logits] + row_metrics_dict = self.track_metrics_row(original_target=original_target,gen_target_options_list=gen_target_options_list) + + if save_track_dict: + self.list_dict_track["data"].append(self.build_json_results(context=original_source, + generated_question_list=gen_target_options_list, + target_question_list=original_target, + row_mean_metrics = row_metrics_dict)) + + # batch_bleu_1_list.append(row_metrics_dict['Bleu_1']) + # batch_bleu_2_list.append(row_metrics_dict['Bleu_2']) + # batch_bleu_3_list.append(row_metrics_dict['Bleu_3']) + # batch_bleu_4_list.append(row_metrics_dict['Bleu_4']) + # batch_CIDEr_list.append(row_metrics_dict['CIDEr']) + # batch_ROUGE_L_list.append(row_metrics_dict['ROUGE_L']) + batch_Glove_Cossine_Similarity_list.append(row_metrics_dict['Glove_Cossine_Similarity']) + + + # batch_metrics_dict = {"Batch_Bleu_1":np.mean(batch_bleu_1_list), + # "Batch_Bleu_2":np.mean(batch_bleu_2_list), + # "Batch_Bleu_3":np.mean(batch_bleu_3_list), + # "Batch_Bleu_4":np.mean(batch_bleu_4_list), + # "Batch_CIDEr":np.mean(batch_CIDEr_list), + # "Batch_ROUGE_L":np.mean(batch_ROUGE_L_list), + batch_metrics_dict = { + "Batch_Glove_Cossine_Similarity":np.mean(batch_Glove_Cossine_Similarity_list) + } + + return batch_metrics_dict + + +class Glove_Embeddings_Comparer(object): + """ + Classes reponsável por criar a matriz de glove embeddings com os textos fornecidos + """ + def __init__(self,glove_weights_path:str,device:str): + super(Glove_Embeddings_Comparer , self).__init__() + + self.device = device + self.glove_path = glove_weights_path + self.glove = None + self.glove_infos = None + self.stopwords = nltk.corpus.stopwords.words('portuguese') + self.extract_glove_properties() + + + def load_glove_vector(self): + """ + Carrega os vetores glove no formato word2vec + """ + + #glove = KeyedVectors.load_word2vec_format(self.glove_path) + try: + glove = KeyedVectors.load_word2vec_format(self.glove_path,no_header=False) + print("load_word2vec_format with no_header=False") + except ValueError: + glove = KeyedVectors.load_word2vec_format(self.glove_path,no_header=True) + print("load_word2vec_format with no_header=True") + + return glove + + def extract_glove_properties(self): + """ + Extrai todas as propriedades dos vetores glove considerando o mapeamento ente palavras e vetores + """ + glove = self.load_glove_vector() + glove_shape = glove.vectors.shape + glove_dim = glove.vector_size + glove_words = 
glove.index_to_key + glove_vectors = torch.from_numpy(glove.vectors).to(self.device) + glove_vocab = {word:i for i, word in enumerate(glove_words)} + + glove_infos = {'glove_shape':glove_shape, + 'glove_dim':glove_dim, + 'glove_words':glove_words, + 'glove_vectors':glove_vectors, + 'glove_vocab':glove_vocab} + + + self.glove = glove + self.glove_infos = glove_infos + + def separate_punctuation_from_words(self,text): + """" + Pontuações são separadas das palavras porque caso estejam juntas esta palavra não estará no vetor de embeddings + """ + punctuation_list = '!(),.:;?' + for punct in list(punctuation_list): + text = text.replace(punct,f" {punct} ") + + text = text.strip() + return text + + def tokenize_text(self,text: list = None): + """ + Transforma o texto em lista de palavras + """ + text = self.separate_punctuation_from_words(text) + tokenize_list = text.split(" ") + tokenize_list = [token for token in tokenize_list if ((token not in self.stopwords) and (token in self.glove_infos['glove_vocab']))] + return tokenize_list + + def cosine_similarity_calculator(self,a, b): + """ + Caclula a similaridade de cossenos entre dois vetores + """ + nominator = np.dot(a, b) + + a_norm = np.sqrt(np.sum(a**2)) + b_norm = np.sqrt(np.sum(b**2)) + + denominator = a_norm * b_norm + + cosine_similarity = nominator / denominator + + return cosine_similarity + + def compare_sentences_with_cossine_similarity(self,text1,text2): + """ + Compara duas sentenças com similaridade de cossenos + """ + tokenize_list1 = self.tokenize_text(text1) + tokenize_list2 = self.tokenize_text(text2) + + embeddigns_sentence1 = [self.glove.get_vector(t1) for t1 in tokenize_list1] + embeddigns_sentence1_mean = np.mean(embeddigns_sentence1,axis=0) + embeddigns_sentence2 = [self.glove.get_vector(t2) for t2 in tokenize_list2] + embeddigns_sentence2_mean = np.mean(embeddigns_sentence2,axis=0) + cosine_similarity = self.cosine_similarity_calculator(embeddigns_sentence1_mean,embeddigns_sentence2_mean) + cosine_similarity = np.float64(cosine_similarity) + return cosine_similarity + + + def batch_average_cossine_similarity(self,list_ref_texts,list_gen_texts): + pass + + diff --git a/tasks/nlp-question-generator/model-question-generator.py b/tasks/nlp-question-generator/model-question-generator.py deleted file mode 100644 index 7c150ec7..00000000 --- a/tasks/nlp-question-generator/model-question-generator.py +++ /dev/null @@ -1,463 +0,0 @@ -import torch -import nltk -from tqdm import tqdm -from multiprocessing import cpu_count -from typing import List, Union, Optional -import numpy as np -import pytorch_lightning as pl -from torch.utils.data import DataLoader -from transformers import T5ForConditionalGeneration -from nlgeval import NLGEval -from gensim.models import KeyedVectors -nltk.download('stopwords') - -class Metrics_Calculator(object): - - def __init__(self,hparams,glove_comparer): - - - super(Metrics_Calculator, self).__init__() - self.nlg_eval = NLGEval(metrics_to_omit=['EmbeddingAverageCosineSimilairty', 'EmbeddingAverageCosineSimilarity','GreedyMatchingScore','SkipThoughtCS','VectorExtremaCosineSimilarity']) - self.list_dict_track = {"data":[]} - self.hparams = hparams - self.glove_comparer = glove_comparer - - - def build_json_results(self, - context, - generated_question_list, - target_question_list, - row_mean_metrics): - - """ - Cria json para cada linha que será salvo para monitorar as métricas em self.list_dict_track - """ - new_info = {} - new_info["context"] =context - new_info["generated_question_list"] 
=generated_question_list - new_info["target_question_list"] =target_question_list - new_info["row_mean_metrics"] =row_mean_metrics - - - return new_info - - def track_metrics_row(self,original_target,gen_target_options_list): - """ - Calcula as métricas para cada par question-context - """ - bleu_1_list = [] - bleu_2_list = [] - bleu_3_list = [] - bleu_4_list = [] - CIDEr_list = [] - ROUGE_L_list = [] - cossine_similarity_list = [] - - for gen_target_option in gen_target_options_list: - - metrics_dict = self.nlg_eval.compute_individual_metrics(ref=[original_target],hyp=gen_target_option)#ref:List[str] , hyp:str - bleu_1_list.append(metrics_dict['Bleu_1']) - bleu_2_list.append(metrics_dict['Bleu_2']) - bleu_3_list.append(metrics_dict['Bleu_3']) - bleu_4_list.append(metrics_dict['Bleu_4']) - CIDEr_list.append(metrics_dict['CIDEr']) - ROUGE_L_list.append(metrics_dict['ROUGE_L']) - cs = self.glove_comparer.compare_sentences_with_cossine_similarity(original_target,gen_target_option) - cossine_similarity_list.append(cs) - - - - row_metrics_dict = {"Bleu_1":np.mean(bleu_1_list), - "Bleu_2":np.mean(bleu_2_list), - "Bleu_3":np.mean(bleu_3_list), - "Bleu_4":np.mean(bleu_4_list), - "CIDEr":np.mean(CIDEr_list), - "ROUGE_L":np.mean(ROUGE_L_list), - "Glove_Cossine_Similarity":np.mean(cossine_similarity_list)} - - return row_metrics_dict - - - - def generate_sentences_and_track_metrics_batch(self,logits,original_targets_batch,original_sources_batch,save_track_dict=False): - """ - Calcula métricas para todo o batch - """ - batch_bleu_1_list = [] - batch_bleu_2_list = [] - batch_bleu_3_list = [] - batch_bleu_4_list = [] - batch_CIDEr_list = [] - batch_ROUGE_L_list = [] - batch_Glove_Cossine_Similarity_list = [] - - - #batch - for i,(original_target,original_source) in enumerate(zip(original_targets_batch,original_sources_batch)): - #linha - relevant_logits = logits[i*self.hparams.num_gen_sentences:self.hparams.num_gen_sentences+i*self.hparams.num_gen_sentences] - gen_target_options_list = [self.hparams.tokenizer.decode(l, skip_special_tokens=True) for l in relevant_logits] - row_metrics_dict = self.track_metrics_row(original_target=original_target,gen_target_options_list=gen_target_options_list) - - if save_track_dict: - self.list_dict_track["data"].append(self.build_json_results(context=original_source, - generated_question_list=gen_target_options_list, - target_question_list=original_target, - row_mean_metrics = row_metrics_dict)) - - batch_bleu_1_list.append(row_metrics_dict['Bleu_1']) - batch_bleu_2_list.append(row_metrics_dict['Bleu_2']) - batch_bleu_3_list.append(row_metrics_dict['Bleu_3']) - batch_bleu_4_list.append(row_metrics_dict['Bleu_4']) - batch_CIDEr_list.append(row_metrics_dict['CIDEr']) - batch_ROUGE_L_list.append(row_metrics_dict['ROUGE_L']) - batch_Glove_Cossine_Similarity_list.append(row_metrics_dict['Glove_Cossine_Similarity']) - - - batch_metrics_dict = {"Batch_Bleu_1":np.mean(batch_bleu_1_list), - "Batch_Bleu_2":np.mean(batch_bleu_2_list), - "Batch_Bleu_3":np.mean(batch_bleu_3_list), - "Batch_Bleu_4":np.mean(batch_bleu_4_list), - "Batch_CIDEr":np.mean(batch_CIDEr_list), - "Batch_ROUGE_L":np.mean(batch_ROUGE_L_list), - "Batch_Glove_Cossine_Similarity":np.mean(batch_Glove_Cossine_Similarity_list) - } - - return batch_metrics_dict - - -class Glove_Embeddings_Comparer(object): - """ - Classes reponsável por criar a matriz de glove embeddings com os textos fornecidos - """ - def __init__(self,glove_weights_path:str,device:str): - super(Glove_Embeddings_Comparer , self).__init__() - - 
self.device = device - self.glove_path = glove_weights_path - self.glove = None - self.glove_infos = None - self.stopwords = nltk.corpus.stopwords.words('portuguese') - self.extract_glove_properties() - - - def load_glove_vector(self): - """ - Carrega os vetores glove no formato word2vec - """ - - #glove = KeyedVectors.load_word2vec_format(self.glove_path) - try: - glove = KeyedVectors.load_word2vec_format(self.glove_path,no_header=False) - print("load_word2vec_format with no_header=False") - except ValueError: - glove = KeyedVectors.load_word2vec_format(self.glove_path,no_header=True) - print("load_word2vec_format with no_header=True") - - return glove - - def extract_glove_properties(self): - """ - Extrai todas as propriedades dos vetores glove considerando o mapeamento ente palavras e vetores - """ - glove = self.load_glove_vector() - glove_shape = glove.vectors.shape - glove_dim = glove.vector_size - glove_words = glove.index_to_key - glove_vectors = torch.from_numpy(glove.vectors).to(self.device) - glove_vocab = {word:i for i, word in enumerate(glove_words)} - - glove_infos = {'glove_shape':glove_shape, - 'glove_dim':glove_dim, - 'glove_words':glove_words, - 'glove_vectors':glove_vectors, - 'glove_vocab':glove_vocab} - - - self.glove = glove - self.glove_infos = glove_infos - - def separate_punctuation_from_words(self,text): - """" - Pontuações são separadas das palavras porque caso estejam juntas esta palavra não estará no vetor de embeddings - """ - punctuation_list = '!(),.:;?' - for punct in list(punctuation_list): - text = text.replace(punct,f" {punct} ") - - text = text.strip() - return text - - def tokenize_text(self,text: list = None): - """ - Transforma o texto em lista de palavras - """ - text = self.separate_punctuation_from_words(text) - tokenize_list = text.split(" ") - tokenize_list = [token for token in tokenize_list if ((token not in self.stopwords) and (token in self.glove_infos['glove_vocab']))] - return tokenize_list - - def cosine_similarity_calculator(self,a, b): - """ - Caclula a similaridade de cossenos entre dois vetores - """ - nominator = np.dot(a, b) - - a_norm = np.sqrt(np.sum(a**2)) - b_norm = np.sqrt(np.sum(b**2)) - - denominator = a_norm * b_norm - - cosine_similarity = nominator / denominator - - return cosine_similarity - - def compare_sentences_with_cossine_similarity(self,text1,text2): - """ - Compara duas sentenças com similaridade de cossenos - """ - tokenize_list1 = self.tokenize_text(text1) - tokenize_list2 = self.tokenize_text(text2) - - embeddigns_sentence1 = [self.glove.get_vector(t1) for t1 in tokenize_list1] - embeddigns_sentence1_mean = np.mean(embeddigns_sentence1,axis=0) - embeddigns_sentence2 = [self.glove.get_vector(t2) for t2 in tokenize_list2] - embeddigns_sentence2_mean = np.mean(embeddigns_sentence2,axis=0) - cosine_similarity = self.cosine_similarity_calculator(embeddigns_sentence1_mean,embeddigns_sentence2_mean) - cosine_similarity = np.float64(cosine_similarity) - return cosine_similarity - - - def batch_average_cossine_similarity(self,list_ref_texts,list_gen_texts): - pass - - -class T5Finetuner(pl.LightningModule): - - def __init__(self, - hparams): - - super(T5Finetuner, self).__init__() - - - self.hparams = hparams - - # ---------- fixing seeds - # self.seed_everything() - pl.utilities.seed.seed_everything(seed = self.hparams.seed) - - - # ---------- Model - self.model = T5ForConditionalGeneration.from_pretrained(self.hparams.model_name) - - #----------Other infos - self.i = 0 - self.step = "Experiment" - self.softmax = 
torch.nn.Softmax(dim=1) - self.loss_funtion = torch.nn.CrossEntropyLoss() - - - #----------Metrics Trackers - if self.hparams.track_metrics == True: - glove_comparer = Glove_Embeddings_Comparer(glove_weights_path=self.hparams.glove_weights_path,device=self.hparams.device) - self.valid_metrics_calculator = Metrics_Calculator(self.hparams,glove_comparer) - self.test_metrics_calculator = Metrics_Calculator(self.hparams,glove_comparer) - - def predict(self,X_context:np.ndarray,num_gen_sentences=10): - self.step = "Deployment" - self.model.eval() - self.hparams["all_data"] = {'X_test':np.array(X_context),'y_test':[]} - self.hparams.num_gen_sentences = num_gen_sentences - result = {} - j = 0 - for i,batch in enumerate(tqdm(self.test_dataloader())): - source_token_ids, source_masks, original_source = batch - logits = self.forward(source_token_ids, source_masks,info_requested='logits') - gen_quesitons = [self.hparams.tokenizer.decode(l, skip_special_tokens=True) for l in logits] - questions_per_context = [gen_quesitons[s:s+self.hparams.num_gen_sentences] for s in list(range(0,len(gen_quesitons),self.hparams.num_gen_sentences))] - result_batch = {j+k:{'context':original_source[k],'questions':questions_per_context[k]} for k in range(len(original_source))} - result.update(result_batch) - j +=len(batch) - - return result - - - def forward(self, source_token_ids, source_mask, target_token_ids=None, - target_mask=None,info_requested='loss'): - - - if info_requested=='loss': - - # TODO calcular a loss dado os target_token_ids - outputs = self.model(input_ids = source_token_ids, attention_mask = source_mask,labels = target_token_ids) - - # loss, predicted_token_ids = outputs[:2] - loss = outputs[0] - result = loss - if info_requested=='logits': - #num_return_sequences must be 1 - if info_requested=='logits': - decoder_output = self.model.generate( - input_ids =source_token_ids, - attention_mask=source_mask, - max_length= self.hparams.target_max_length, - do_sample=True, - num_return_sequences=self.hparams.num_gen_sentences, - temperature = self.hparams.temperature, - top_p=self.hparams.top_p, - top_k=0) - - result = decoder_output - - return result - - def training_step(self, batch, batch_nb): - # batch - source_token_ids, source_masks, target_token_ids, target_masks, original_sources, original_targets = batch - - # fwd - loss = self.forward(source_token_ids, source_masks, target_token_ids,info_requested='loss') - batch_metrics_dict = {'loss':loss} - return batch_metrics_dict - - - def validation_step(self, batch, batch_nb): - # batch - source_token_ids, source_masks, target_token_ids, target_masks, original_sources, original_targets = batch - - # fwd - loss = self.forward(source_token_ids, source_masks, target_token_ids,info_requested='loss') - logits = self.forward(source_token_ids, source_masks, target_token_ids,info_requested='logits') - - #Calc Metrics and Saving Results - batch_metrics_dict = self.valid_metrics_calculator.generate_sentences_and_track_metrics_batch(logits,original_targets,original_sources,save_track_dict=True) - - tensorboard_logs = {'valid_'+key: value for (key, value) in batch_metrics_dict.items()} - tensorboard_logs['valid_loss'] = loss.item() - - #include special values to batch metrics dict - batch_metrics_dict['loss'] = loss - batch_metrics_dict['log'] = tensorboard_logs - - for key, value in batch_metrics_dict.items(): - self.log(key, value, on_step=True, prog_bar=True, logger=True) - - return batch_metrics_dict - - def test_step(self, batch, batch_nb): - - # batch - 
source_token_ids, source_masks, target_token_ids, target_masks, original_sources, original_targets = batch - - # fwd - logits = self.forward(source_token_ids, source_masks, target_token_ids,info_requested='logits') - - #Calc Metrics and Saving Results - batch_metrics_dict = self.test_metrics_calculator.generate_sentences_and_track_metrics_batch(logits,original_targets,original_sources,save_track_dict=True) - - tensorboard_logs = {'test_'+key: value for (key, value) in batch_metrics_dict.items()} - - #include special values to batch metrics dict - batch_metrics_dict['log'] = tensorboard_logs - for key, value in batch_metrics_dict.items(): - self.log(key, value, on_step=True, prog_bar=True, logger=True) - - return batch_metrics_dict - - def get_epoch_results(self,outputs,step='train'): - - tensorboard_logs = {} - - if step != "test": - temp_avg_loss_batch = [x["loss"] for x in outputs] - avg_loss = torch.stack(temp_avg_loss_batch).mean() - - if step != "train": - temp_avg_bleu1_batch = [x["Batch_Bleu_1"] for x in outputs] - temp_avg_bleu2_batch = [x["Batch_Bleu_2"] for x in outputs] - temp_avg_bleu3_batch = [x["Batch_Bleu_3"] for x in outputs] - temp_avg_bleu4_batch = [x["Batch_Bleu_4"] for x in outputs] - temp_avg_cider_batch = [x["Batch_CIDEr"] for x in outputs] - temp_avg_rougeL_batch = [x["Batch_ROUGE_L"] for x in outputs] - temp_avg_glove_cossine_similarity = [x["Batch_Glove_Cossine_Similarity"] for x in outputs] - - avg_bleu1 = np.stack(temp_avg_bleu1_batch).mean() - avg_bleu2 = np.stack(temp_avg_bleu2_batch).mean() - avg_bleu3 = np.stack(temp_avg_bleu3_batch).mean() - avg_bleu4 = np.stack(temp_avg_bleu4_batch).mean() - avg_cider = np.stack(temp_avg_cider_batch).mean() - avg_rougeL = np.stack(temp_avg_rougeL_batch).mean() - avg_glove_cossine_similarity = np.stack(temp_avg_glove_cossine_similarity).mean() - - tensorboard_logs[f"avg_{step}_bleu1"] = avg_bleu1 - tensorboard_logs[f"avg_{step}_bleu2"] = avg_bleu2 - tensorboard_logs[f"avg_{step}_bleu3"] = avg_bleu3 - tensorboard_logs[f"avg_{step}_bleu4"] = avg_bleu4 - tensorboard_logs[f"avg_{step}_cider"] = avg_cider - tensorboard_logs[f"avg_{step}_rougeL"] = avg_rougeL - tensorboard_logs[f"avg_{step}_rougeL"] = avg_glove_cossine_similarity - - if step != "test": - tensorboard_logs[f"avg_{step}_loss"] = avg_loss.item() - - epoch_dict = tensorboard_logs.copy() - epoch_dict['log'] = tensorboard_logs - - for key, value in epoch_dict.items(): - self.log(key, value, on_epoch=True, prog_bar=True, logger=True) - - return epoch_dict - - def training_epoch_end(self, outputs): - if not outputs: - return {} - epoch_dict = self.get_epoch_results(outputs,'train') - - - def validation_epoch_end(self, outputs): - epoch_dict = self.get_epoch_results(outputs,'valid') - return epoch_dict #must do to save checkpoints - - def test_epoch_end(self, outputs): - epoch_dict = self.get_epoch_results(outputs,'test') - - - def configure_optimizers(self): - return torch.optim.AdamW( - [p for p in self.parameters() if p.requires_grad], - lr=self.hparams.learning_rate, eps=self.hparams.eps) - - def train_dataloader(self): - self.train_dataset = self.hparams.CustomDataset(PREFIX=self.hparams.PREFIX, - tokenizer=self.hparams.tokenizer, - X_context=self.hparams.all_data['X_train'], - y_question=self.hparams.all_data['y_train'], - source_max_length=self.hparams.source_max_length, - target_max_length=self.hparams.target_max_length, - step=self.step, - ) - shuffle = False if self.hparams.overfit else True - return DataLoader(self.train_dataset, 
batch_size=self.hparams.train_batch_size, shuffle=shuffle,num_workers=cpu_count()) - - def val_dataloader(self): - self.valid_dataset = self.hparams.CustomDataset(PREFIX=self.hparams.PREFIX, - tokenizer=self.hparams.tokenizer, - X_context=self.hparams.all_data['X_valid'], - y_question=self.hparams.all_data['y_valid'], - source_max_length=self.hparams.source_max_length, - target_max_length=self.hparams.target_max_length, - step=self.step, - ) - return DataLoader(self.valid_dataset, batch_size=self.hparams.eval_batch_size, shuffle=False,num_workers=cpu_count()) - - def test_dataloader(self): - self.test_dataset = self.hparams.CustomDataset(PREFIX=self.hparams.PREFIX, - tokenizer=self.hparams.tokenizer, - X_context=self.hparams.all_data['X_test'], - y_question=self.hparams.all_data['y_test'], - source_max_length=self.hparams.source_max_length, - target_max_length=self.hparams.target_max_length, - step=self.step, - ) - return DataLoader(self.test_dataset, batch_size=self.hparams.eval_batch_size,shuffle=False, num_workers=cpu_count()) - diff --git a/tasks/nlp-question-generator/model.py b/tasks/nlp-question-generator/model.py new file mode 100644 index 00000000..9f2c8ffd --- /dev/null +++ b/tasks/nlp-question-generator/model.py @@ -0,0 +1,199 @@ +import torch +from tqdm import tqdm +from multiprocessing import cpu_count +from typing import List, Union, Optional +import numpy as np +import pandas as pd +import pytorch_lightning as pl +from torch.utils.data import DataLoader +from transformers import T5ForConditionalGeneration +from metrics_calculator import Glove_Embeddings_Comparer, Metrics_Calculator + +class T5Finetuner(pl.LightningModule): + + def __init__(self, + hparams): + + super(T5Finetuner, self).__init__() + + + self.hparams = hparams + + # ---------- fixing seeds + pl.utilities.seed.seed_everything(seed = self.hparams.seed) + + + # ---------- Model + self.model = T5ForConditionalGeneration.from_pretrained(self.hparams.model_name) + self.model.to(self.hparams.device) + + # ---------- Metrics Trackers + if self.hparams.track_metrics: + glove_comparer = Glove_Embeddings_Comparer(glove_weights_path=self.hparams.glove_weights_path,device=self.hparams.device) + self.valid_metrics_calculator = Metrics_Calculator(self.hparams,glove_comparer) + self.test_metrics_calculator = Metrics_Calculator(self.hparams,glove_comparer) + + + def forward(self, source_token_ids, source_mask, target_token_ids=None, + target_mask=None,info_requested='loss',num_gen_sentences = None): + + + if info_requested=='loss': + + # compute the loss for the given target_token_ids + outputs = self.model(input_ids = source_token_ids, attention_mask = source_mask,labels = target_token_ids) + + loss = outputs[0] + result = loss + if info_requested=='logits': + # sample num_gen_sentences candidate questions for each input + num_gen_sentences = num_gen_sentences if num_gen_sentences else self.hparams.num_gen_sentences + + decoder_output = self.model.generate( + input_ids =source_token_ids, + attention_mask=source_mask, + max_length= self.hparams.target_max_length, + do_sample=True, + num_return_sequences=num_gen_sentences, + temperature = self.hparams.temperature, + top_p=self.hparams.top_p, + top_k=0) + + result = decoder_output + + return result + + def training_step(self, batch, batch_nb): + # batch + source_token_ids, source_masks, target_token_ids, target_masks, original_sources, original_targets = batch + + # fwd + loss = 
self.forward(source_token_ids, source_masks, target_token_ids,info_requested='loss') + batch_metrics_dict = {'loss': loss} + return batch_metrics_dict + + + def validation_step(self, batch, batch_nb): + # batch + source_token_ids, source_masks, target_token_ids, target_masks, original_sources, original_targets = batch + + # fwd + loss = self.forward(source_token_ids, source_masks, target_token_ids,info_requested='loss') + logits = self.forward(source_token_ids, source_masks, target_token_ids,info_requested='logits') + + # calculate metrics and save results + batch_metrics_dict = self.valid_metrics_calculator.generate_sentences_and_track_metrics_batch(logits,original_targets,original_sources,save_track_dict=True) + + batch_metrics_dict = {'valid_'+key: value for (key, value) in batch_metrics_dict.items()} + batch_metrics_dict['valid_loss'] = loss.item() + + # include special values in the batch metrics dict + batch_metrics_dict['loss'] = loss + + for key, value in batch_metrics_dict.items(): + self.log(key, value, on_step=True, prog_bar=True, logger=True) + + return batch_metrics_dict + + def test_step(self, batch, batch_nb): + + # batch + source_token_ids, source_masks, target_token_ids, target_masks, original_sources, original_targets = batch + + # fwd + logits = self.forward(source_token_ids, source_masks, target_token_ids,info_requested='logits') + + # calculate metrics and save results + batch_metrics_dict = self.test_metrics_calculator.generate_sentences_and_track_metrics_batch(logits,original_targets,original_sources,save_track_dict=True) + + batch_metrics_dict = {'test_'+key: value for (key, value) in batch_metrics_dict.items()} + + + # log the batch metrics + for key, value in batch_metrics_dict.items(): + self.log(key, value, on_step=True, prog_bar=True, logger=True) + + return batch_metrics_dict + + def get_epoch_results(self,outputs,step='train'): + + tensorboard_logs = {} + + if step != "test": + temp_avg_loss_batch = [x["loss"] for x in outputs] + avg_loss = torch.stack(temp_avg_loss_batch).mean() + + if step != "train": + # temp_avg_bleu1_batch = [x[f"{step}_Batch_Bleu_1"] for x in outputs] + # temp_avg_bleu2_batch = [x[f"{step}_Batch_Bleu_2"] for x in outputs] + # temp_avg_bleu3_batch = [x[f"{step}_Batch_Bleu_3"] for x in outputs] + # temp_avg_bleu4_batch = [x[f"{step}_Batch_Bleu_4"] for x in outputs] + # temp_avg_cider_batch = [x[f"{step}_Batch_CIDEr"] for x in outputs] + # temp_avg_rougeL_batch = [x[f"{step}_Batch_ROUGE_L"] for x in outputs] + temp_avg_glove_cossine_similarity = [x[f"{step}_Batch_Glove_Cossine_Similarity"] for x in outputs] + + # avg_bleu1 = np.stack(temp_avg_bleu1_batch).mean() + # avg_bleu2 = np.stack(temp_avg_bleu2_batch).mean() + # avg_bleu3 = np.stack(temp_avg_bleu3_batch).mean() + # avg_bleu4 = np.stack(temp_avg_bleu4_batch).mean() + # avg_cider = np.stack(temp_avg_cider_batch).mean() + # avg_rougeL = np.stack(temp_avg_rougeL_batch).mean() + avg_glove_cossine_similarity = np.stack(temp_avg_glove_cossine_similarity).mean() + + # tensorboard_logs[f"avg_{step}_bleu1"] = avg_bleu1 + # tensorboard_logs[f"avg_{step}_bleu2"] = avg_bleu2 + # tensorboard_logs[f"avg_{step}_bleu3"] = avg_bleu3 + # tensorboard_logs[f"avg_{step}_bleu4"] = avg_bleu4 + # tensorboard_logs[f"avg_{step}_cider"] = avg_cider + # tensorboard_logs[f"avg_{step}_rougeL"] = avg_rougeL + tensorboard_logs[f"avg_{step}_glove_cossine_similarity"] = avg_glove_cossine_similarity + + if step != "test": 
tensorboard_logs[f"avg_{step}_loss"] = avg_loss.item() + + epoch_dict = tensorboard_logs.copy() + epoch_dict['log'] = tensorboard_logs + + for key, value in epoch_dict.items(): + self.log(key, value, on_epoch=True, prog_bar=True, logger=True) + + return epoch_dict + + def training_epoch_end(self, outputs): + if not outputs: + return {} + epoch_dict = self.get_epoch_results(outputs,'train') + + + def validation_epoch_end(self, outputs): + epoch_dict = self.get_epoch_results(outputs,'valid') + return epoch_dict #must do to save checkpoints + + def test_epoch_end(self, outputs): + epoch_dict = self.get_epoch_results(outputs,'test') + + + def configure_optimizers(self): + return torch.optim.AdamW( + [p for p in self.parameters() if p.requires_grad], + lr=self.hparams.learning_rate, eps=self.hparams.eps) + + def train_dataloader(self): + shuffle = False if self.hparams.overfit else True + return DataLoader(self.hparams.train_dataset, batch_size=self.hparams.train_batch_size, shuffle=shuffle,num_workers=cpu_count()) + + def val_dataloader(self): + + return DataLoader(self.hparams.valid_dataset, batch_size=self.hparams.eval_batch_size, shuffle=False,num_workers=cpu_count()) + + def test_dataloader(self): + + return DataLoader(self.hparams.test_dataset, batch_size=self.hparams.eval_batch_size,shuffle=False, num_workers=cpu_count()) + diff --git a/tasks/nlp-question-generator/params.yaml b/tasks/nlp-question-generator/params.yaml new file mode 100644 index 00000000..c622c00a --- /dev/null +++ b/tasks/nlp-question-generator/params.yaml @@ -0,0 +1,38 @@ +# https://geekflare.com/python-yaml-intro/ +prepare_data: + test_size_from_dev: 0.5 + +hparams: + model_name: "unicamp-dl/ptt5-base-portuguese-vocab" + PREFIX: "gerador_perguntas:" + save_every: 5000 + num_gen_sentences: 2 + #num_gen_sentences_infer: 10 + no_repeat_ngram_size: 2 + temperature: 0.7 + top_p: 0.92 + train_batch_size: 4 + eval_batch_size: 32 + inference_batch_size: 16 + source_max_length: 512 + target_max_length: 100 + learning_rate: 3.0e-5 + eps: 1.0e-08 + seed: 13 + +lightning_params: + num_gpus: 1 + profiler: True + max_epochs: 1 + accumulate_grad_batches: 16 + check_val_every_n_epoch: 1 + progress_bar_refresh_rate: 1 + gradient_clip_val: 1.0 + fast_dev_run: False + +early_stop_callback: + monitor: 'avg_train_loss' + min_delta: 0.01 + patience: 1 + verbose: False + mode: 'min' \ No newline at end of file diff --git a/tasks/nlp-question-generator/select_queries.py b/tasks/nlp-question-generator/select_queries.py new file mode 100644 index 00000000..fa9dc437 --- /dev/null +++ b/tasks/nlp-question-generator/select_queries.py @@ -0,0 +1,83 @@ +import os +import pandas as pd +from vident.io_utils import IO_Utils +import faiss + +from vident.document_retriever.sparse_similarity.similarity import TfidfVectorizer + +from sklearn.cluster import KMeans +from sklearn import metrics +import matplotlib.pyplot as plt + +#import faiss +import numpy as np + + +class FaissKMeans: + def __init__(self, n_clusters=8, n_init=10, max_iter=300): + self.n_clusters = n_clusters + self.n_init = n_init + self.max_iter = max_iter + self.kmeans = None + self.cluster_centers_ = None + self.inertia_ = None + + def fit(self, X, y): + self.kmeans = faiss.Kmeans(d=X.shape[1], + k=self.n_clusters, + niter=self.max_iter, + nredo=self.n_init) + self.kmeans.train(X.astype(np.float32)) + self.cluster_centers_ = self.kmeans.centroids + self.inertia_ = self.kmeans.obj[-1] + + def predict(self, X): + return self.kmeans.index.search(X.astype(np.float32), 1)[1] + + +if __name__ 
== '__main__': + root_dir = os.path.abspath(os.getcwd()) + data_dir = os.path.join(root_dir,"data") + qgenerator_dir = os.path.join(data_dir,'qgenerator') + context_questions_map_path = os.path.join(qgenerator_dir,'context_questions_map.json') + + io_utils = IO_Utils() + context_questions_map = io_utils.read_json(filepath=context_questions_map_path) + example_context = context_questions_map['9']['context'] + # '4.6 Análise Estatística: Os dados obtidos foram submetidos ao estudo da homogeneidade da variância (para estabilizar ou reduzir a variabilidade existente) através do método Box-Cox contido no PROC TRANSREG do Sistema SAS. Como para valores nulos a família de transformações de Box-Cox fica restrita, utilizou-se a variável somada a uma constante (+1.0). fitotoxicidade aos 6 e 18 DAAreinfestação de trapoeraba aos 41 DAAfoi sugerida a transformação dos dados com valor de lambda (+0.0), (-3.0) e (+0.0), respectivamente. Após a transformação dos dados as variáveis fitotoxicidade aos 18 DAAreinfestação de trapoeraba aos 41 DAAnão apresentaram distribuição normal, portanto para estabilizar a variabilidade dos tratamentos foi utilizada a estatística não paramétrica, através do Teste de Friedman. Os dados, então, foram submetidos a análise de variância, sendo a comparação das médias quando significativas realizadas pelo teste LSD ao nível de 5 % de probabilidade. Para a análise dos dados foi utilizado o software SAS.' + example_questions = context_questions_map['9']['questions'] + # ['Quando foi utilizado o método Box-Cox?', + # 'Qual foi o critério utilizado para analisar os dados para a análise de variância?', + # 'Qual foi a frequência do teste paramétrica?', + # 'Em que nível a análise estatística é realizada?', + # 'Qual foi o valor de lambda usado para estabilizar a variabilidade dos tratamentos?', + # 'Quantos dados foram submetidos a análise de variância?', + # 'Qual foi a função de equação usada para estabilizar a variabilidade dos tratamentos?', + # 'O que foi adicionado para estabilizar a variabilidade dos tratamentos?', + # 'Quantos tratamentos foram submetidos ao estudo da homogeneidade da variância?', + # 'Qual foi a análise de variância?'] + + print('example_questions:\n',example_questions) + + vectorizer = TfidfVectorizer() + vectorizer.fit(example_questions) + vec = vectorizer.transform(example_questions) + X = vec.toarray() + + + kmeans = KMeans(n_clusters=2) + kmeans.fit(X) + y_kmeans = kmeans.predict(X) + + print(kmeans.cluster_centers_) + + print(kmeans.labels_) + + plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='viridis') + + centers = kmeans.cluster_centers_ + plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5) + plt.show() \ No newline at end of file diff --git a/tests/datasets.py b/tests/datasets.py index 254c53da..509d380c 100644 --- a/tests/datasets.py +++ b/tests/datasets.py @@ -292,7 +292,7 @@ def report_contexts_test_data(): return data -def document_reader_test_data(): +def report_contexts_test_data(): data = { "data": { "ndarray": [["Na região da Amazônia, o fertilizante foliar Ômega utilizado na dessecação pré-colheita do feijão, var. Imperador Vermelho, realizado com 84,2 porcento de vagens maduras, resultou em dessecação eficaz (97%) quando utilizado na dose de 4.000 mL.ha-1 , com desempenho similar a Trunfo (1.500 mL.ha-1 ) e Reglone (2.000 mL.ha-1 ). 
O aumento da dose de Ômega para 5.000 ou 10.000 mL.ha-1 não resultou em diferença significativa na dessecação das plantas (Figura 6);","Qual o resultado da utilização do fertilizante foliar Ômega, quando utilizado cerrado?"], diff --git a/tests/test_nlp_dense_document_retriever.py b/tests/test_nlp_dense_document_retriever.py index a6580ae8..7de85537 100644 --- a/tests/test_nlp_dense_document_retriever.py +++ b/tests/test_nlp_dense_document_retriever.py @@ -50,11 +50,7 @@ def test_experiment_report_contexts(self): "/dev/null", ) data = datasets.report_contexts_test_data() - print("######################################################") - print(data) with server.Server() as s: - print("######################################################") - print("s") response = s.test(data=data,timeout=10) ndarray = response["ndarray"] self.assertEqual(len(ndarray[0]), 4) # 1 feature \ No newline at end of file diff --git a/tests/test_nlp_document_reader.py b/tests/test_nlp_document_reader.py index ceff5c01..d4ce743f 100644 --- a/tests/test_nlp_document_reader.py +++ b/tests/test_nlp_document_reader.py @@ -73,7 +73,7 @@ def test_experiment(self): "/dev/null", ) - data = datasets.document_reader_test_data() + data = datasets.report_contexts_test_data() with server.Server() as s: response = s.test(data=data) diff --git a/tests/test_nlp_question_generator.py b/tests/test_nlp_question_generator.py new file mode 100644 index 00000000..f175f2c2 --- /dev/null +++ b/tests/test_nlp_question_generator.py @@ -0,0 +1,89 @@ +import os +import unittest +import uuid + +import papermill + +from tests import datasets, server + +EXPERIMENT_ID = str(uuid.uuid4()) +OPERATOR_ID = str(uuid.uuid4()) +RUN_ID = str(uuid.uuid4()) + +class TestQuestionGenerator(unittest.TestCase): + + def setUp(self): + # Set environment variables needed to run notebooks + os.environ["EXPERIMENT_ID"] = EXPERIMENT_ID + os.environ["OPERATOR_ID"] = OPERATOR_ID + os.environ["RUN_ID"] = RUN_ID + + datasets.reports_contexts_small() + + os.chdir("tasks/nlp-question-generator") + + def tearDown(self): + datasets.clean() + os.chdir("../../") + + def test_experiment(self): + + papermill.execute_notebook( + "Experiment.ipynb", + "/dev/null", + parameters=dict( + dataset="/tmp/data/reports_contexts_small.csv", + column_context = "context", + column_question = "question", + column_answer_start = "answer_start", + column_answer_end= "answer_end", + train_from_zero = False, + train_from_squad = False, + expand_context = True, + dev_size_from_data= 0.2, + test_size_from_dev= 0.5, + model_name= "unicamp-dl/ptt5-base-portuguese-vocab", + PREFIX = "gerador_perguntas:", + num_gen_sentences = 2, + infer_num_gen_sentences = 10, + train_batch_size= 2, + eval_batch_size= 8, + infer_batch_size = 8, + no_repeat_ngram_size= 2, + temperature= 0.7, + top_p= 0.92, + source_max_length= 512, + target_max_length= 100, + learning_rate= 3.0e-5, + eps= 1.0e-08, + seed = 13, + num_gpus= 1, + profiler= True, + max_epochs= 1, + accumulate_grad_batches= 16, + check_val_every_n_epoch= 1, + progress_bar_refresh_rate= 1, + gradient_clip_val= 1.0, + fast_dev_run= False, + monitor= 'avg_train_loss', + min_delta= 0.01, + patience= 1, + verbose= False, + mode= 'min' + ), + ) + + papermill.execute_notebook( + "Deployment.ipynb", + "/dev/null", + ) + + data = datasets.report_contexts_test_data() + + with server.Server() as s: + response = s.test(data=data) + + names = response["names"] + ndarray = response["ndarray"] + self.assertEqual(len(ndarray[0]), 4) + self.assertEqual(len(names), 4) \ No 
newline at end of file
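
For reference, a minimal sketch of what metrics_calculator.py's Glove_Embeddings_Comparer computes, assuming a word2vec-format GloVe file (the glove_s300.txt path below is a placeholder, not a file shipped with this task): each sentence is reduced to the mean of its in-vocabulary word vectors, and the two means are scored with cosine similarity.

import numpy as np
from gensim.models import KeyedVectors

# placeholder path: any pt-BR GloVe export in word2vec format should work here
glove = KeyedVectors.load_word2vec_format("glove_s300.txt", no_header=True)

def sentence_vector(text):
    # keep only tokens present in the embedding vocabulary, as tokenize_text does
    tokens = [t for t in text.split() if t in glove.key_to_index]
    return np.mean([glove.get_vector(t) for t in tokens], axis=0)

def cosine_similarity(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine_similarity(sentence_vector("qual foi o resultado do teste ?"),
                        sentence_vector("qual resultado o teste apresentou ?")))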
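The hparams consumed by T5Finetuner line up with the hparams section of params.yaml. One plausible way to wire them together, with argparse.Namespace used purely for illustration (the task itself may pass parameters through papermill instead):

import yaml
from argparse import Namespace

# parse params.yaml and expose the hparams section with attribute-style access
with open("tasks/nlp-question-generator/params.yaml") as f:
    params = yaml.safe_load(f)

hparams = Namespace(**params["hparams"])
print(hparams.model_name, hparams.num_gen_sentences, hparams.learning_rate)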
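The 'logits' branch of T5Finetuner.forward boils down to a sampled generate call. A standalone sketch with the checkpoint and decoding settings from params.yaml; note the raw checkpoint has not been fine-tuned for question generation, so meaningful questions require the trained weights, and the input text below is a hypothetical example.

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = "unicamp-dl/ptt5-base-portuguese-vocab"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# hypothetical context; the task prepends the PREFIX from params.yaml
text = "gerador_perguntas: O fertilizante foliar Ômega resultou em dessecação eficaz do feijão."
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

with torch.no_grad():
    out = model.generate(input_ids=inputs.input_ids,
                         attention_mask=inputs.attention_mask,
                         max_length=100,          # target_max_length
                         do_sample=True,
                         num_return_sequences=2,  # num_gen_sentences
                         temperature=0.7,
                         top_p=0.92,
                         top_k=0)

for seq in out:
    print(tokenizer.decode(seq, skip_special_tokens=True))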
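FaissKMeans in select_queries.py wraps faiss.Kmeans behind an sklearn-style estimator surface. A small sketch of the same calls its fit and predict methods make internally, against random data and assuming faiss-cpu is installed:

import numpy as np
import faiss

X = np.random.rand(100, 8).astype(np.float32)
kmeans = faiss.Kmeans(d=X.shape[1], k=3, niter=300, nredo=10)
kmeans.train(X)

centroids = kmeans.centroids             # exposed as cluster_centers_
inertia = kmeans.obj[-1]                 # exposed as inertia_
_, labels = kmeans.index.search(X, 1)    # what predict() returns
print(inertia, labels[:5].ravel())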
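The __main__ block of select_queries.py stops at inspecting cluster assignments, but the apparent goal is to keep a diverse subset of the generated questions. A sketch of that selection step, with sklearn's TfidfVectorizer standing in for the vident vectorizer the script imports: cluster the TF-IDF vectors, then keep one question per cluster, the one closest to the centroid.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

questions = [
    "Quando foi utilizado o método Box-Cox?",
    "Qual foi o valor de lambda usado para estabilizar a variabilidade dos tratamentos?",
    "O que foi adicionado para estabilizar a variabilidade dos tratamentos?",
    "Qual foi a análise de variância?",
]

X = TfidfVectorizer().fit_transform(questions).toarray()
kmeans = KMeans(n_clusters=2, n_init=10).fit(X)

# keep the question nearest to each centroid as the cluster representative
selected = []
for c in range(kmeans.n_clusters):
    members = np.where(kmeans.labels_ == c)[0]
    dists = np.linalg.norm(X[members] - kmeans.cluster_centers_[c], axis=1)
    selected.append(questions[members[np.argmin(dists)]])
print(selected)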