platiagro · math-sasso · Jul 26, 2021 · Jul 29, 2021 · Aug 2, 2021 · Aug 2, 2021
diff --git a/.github/workflows/nlp-question-generator.yml b/.github/workflows/nlp-question-generator.yml
@@ -0,0 +1,54 @@
+name: Question Generator
+
+# Triggered only when the task or its test files change on the listed branches.
+on:
+  pull_request:
+    branches:
+      - main
+      - 'v*-branch'
+    paths:
+      - 'tasks/nlp-question-generator/**'
+      - tests/datasets.py
+      - tests/server.py
+      - tests/test_nlp_question_generator.py
+  push:
+    branches:
+      - main
+      - 'v*-branch'
+    paths:
+      - 'tasks/nlp-question-generator/**'
+      - tests/datasets.py
+      - tests/server.py
+      - tests/test_nlp_question_generator.py
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    # Every step runs inside this container instead of directly on the VM host,
+    # so networking is container-based; MINIO_ENDPOINT addresses the service by name.
+    container:
+      image: platiagro/platiagro-notebook-image:0.3.0
+
+    services:
+      minio:
+        image: bitnami/minio:latest
+        ports:
+          - '9000:9000'
+        env:
+          MINIO_ACCESS_KEY: minio
+          MINIO_SECRET_KEY: minio123
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Test with pytest
+        timeout-minutes: 90
+        env:
+          MINIO_ENDPOINT: 'minio:9000'
+          MINIO_ACCESS_KEY: minio
+          MINIO_SECRET_KEY: minio123
+        run: |
+          pip install pytest
+          pytest -v tests/test_nlp_question_generator.py
diff --git a/README.md b/README.md
@@ -29,6 +29,7 @@ Task | Status | License
 [Sparse Document Retriever](tasks/nlp-sparse-document-retriever/) | [![Sparse Document Retriever](https://github.com/platiagro/tasks/workflows/Sparse%20Document%20Retriever/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/nlp-sparse-document-retriever.yml) | TBD
 [Dense Document Retriever](tasks/nlp-dense-document-retriever/) | [![Dense Document Retriever](https://github.com/platiagro/tasks/workflows/Dense%20Document%20Retriever/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/nlp-dense-document-retriever.yml) | TBD
 [Document Reader](tasks/nlp-document-reader/) | [![Document Reader](https://github.com/platiagro/tasks/workflows/Document%20Reader/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/nlp-document-reader.yml) | TBD
+[Question Generator](tasks/nlp-question-generator/) | [![Question Generator](https://github.com/platiagro/tasks/workflows/Question%20Generator/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/nlp-question-generator.yml) | TBD
 [Normalizer](tasks/normalizer/) | [![Normalizer](https://github.com/platiagro/tasks/workflows/Normalizer/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/normalizer.yml) | [BSD 3-Clause](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
 [Pre Selection](tasks/pre-selection/) | [![Pre Selection](https://github.com/platiagro/tasks/workflows/Pre%20Selection/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/pre-selection.yml) | [BSD 3-Clause](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
 [Random Forest Classifier](tasks/random-forest-classifier/) | [![Random Forest Classifier](https://github.com/platiagro/tasks/workflows/Random%20Forest%20Classifier/badge.svg)](https://github.com/platiagro/tasks/actions/workflows/random-forest-classifier.yml) | [BSD 3-Clause](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)

diff --git a/tasks/nlp-question-generator/Deployment.ipynb b/tasks/nlp-question-generator/Deployment.ipynb
@@ -0,0 +1,155 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Question Generator - Implantação\n",
+    "\n",
+    "Utiliza um transformer T5 pré-treinado em português e disponibilizado pela [Hugging Face](https://huggingface.co/).<br>\n",
+    "\n",
+    "### **Em caso de dúvidas, consulte os [tutoriais da PlatIAgro](https://platiagro.github.io/tutorials/).**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Declaração de Classe para Predições em Tempo Real\n",
+    "\n",
+    "A tarefa de implantação cria um serviço REST para predições em tempo real.<br>\n",
+    "Para isso você deve criar uma classe `Model` que implementa o método `predict`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting Model.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile Model.py\n",
+    "import joblib\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from typing import List\n",
+    "from expander import DocExpander\n",
+    "from aux_functions import build_df_result\n",
+    "\n",
+    "\n",
+    "class Model:\n",
+    "    \"\"\"Prediction class for the question-generator deployment (artifacts: `qgenerator.joblib`).\"\"\"\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        # Artifacts are loaded lazily, on the first `predict` call.\n",
+    "        self.loaded = False\n",
+    "\n",
+    "    def load(self):\n",
+    "        \"\"\"Read the model and its settings from `/tmp/data/qgenerator.joblib`.\"\"\"\n",
+    "        artifacts = joblib.load(\"/tmp/data/qgenerator.joblib\")\n",
+    "        self.model = artifacts[\"model\"]\n",
+    "        self.expand_context = artifacts[\"expand_context\"]\n",
+    "        self.infer_num_gen_sentences = artifacts[\"infer_num_gen_sentences\"]\n",
+    "        self.column_context = artifacts[\"column_context\"]\n",
+    "        self.column_question = artifacts[\"column_question\"]\n",
+    "        self.loaded = True\n",
+    "\n",
+    "    def class_names(self) -> List:\n",
+    "        \"\"\"Return the column names this model reports (presumably those of the `predict` output).\"\"\"\n",
+    "        return ['doc_id','context','questions','expanded_context']\n",
+    "\n",
+    "    def expand(self, df):\n",
+    "        \"\"\"Expand `df` with `DocExpander.expand_sql`; return `df` unchanged when `expand_context` is falsy.\"\"\"\n",
+    "        if not self.expand_context:\n",
+    "            return df\n",
+    "        exp = DocExpander()\n",
+    "        return exp.expand_sql(df,context_column_name=self.column_context,questions_column_name = self.column_question)\n",
+    "\n",
+    "    def predict(self, X, feature_names, meta=None):\n",
+    "        \"\"\"Generate questions for the `context` column of `X`; ValueError on unexpected `feature_names`.\"\"\"\n",
+    "        if not self.loaded:\n",
+    "            self.load()\n",
+    "        feature_names_pipeline = ['doc_id', 'context']\n",
+    "        feature_names_qa = ['context']\n",
+    "        if feature_names != feature_names_pipeline and feature_names != feature_names_qa:\n",
+    "            raise ValueError(f'feature_names deve ser {feature_names_pipeline} ou {feature_names_qa}')\n",
+    "\n",
+    "        df_input = pd.DataFrame(X,columns=feature_names)\n",
+    "        contexts = df_input['context'].to_numpy()\n",
+    "        gen_questions_dict = self.model.forward(contexts=contexts, num_gen_sentences=self.infer_num_gen_sentences)\n",
+    "        df_result = build_df_result(gen_questions_dict,column_context=self.column_context,column_question=self.column_question)\n",
+    "        df_result = self.expand(df_result)\n",
+    "\n",
+    "        if feature_names == feature_names_pipeline:\n",
+    "            # Join doc_id to the results on the DataFrame index; `.copy()` avoids writing into a slice.\n",
+    "            df_input = df_input[['doc_id']].copy()\n",
+    "            df_input['index'] = df_input.index\n",
+    "            df_result['index'] = df_result.index\n",
+    "            df_result = pd.merge(df_input, df_result, on='index', how='outer')\n",
+    "            del df_result['index']\n",
+    "\n",
+    "        return df_result.to_numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import pandas as pd\n",
+    "# df = pd.read_csv(\"squad-test-v1.1.csv\")\n",
+    "# n_lines = 10\n",
+    "# contexts = df['context'][:n_lines]\n",
+    "# indexes = df.index[:n_lines]\n",
+    "\n",
+    "# df_small = pd.DataFrame({'doc_id':indexes,'context':contexts})\n",
+    "# X = df_small.to_numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# from Model import Model\n",
+    "# model = Model()\n",
+    "# result = model.predict(X,['doc_id','context'])\n",
+    "# result"
+   ]
+  }
+ ],
+ "metadata": {
+  "celltoolbar": "Tags",
+  "experiment_id": "dd63cfbd-7a97-41ac-bd9b-fd11711ba459",
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.10"
+  },
+  "operator_id": "e4150bc8-88f2-4d98-b68a-6c246270c403",
+  "task_id": "ccfeb3fe-3d3a-43cf-bdc4-d0b07017e468"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}