diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae83d17db07f..9d6494123af0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -104,6 +104,10 @@ files, along with unit tests, examples and tutorials
 - Added TRADE (dialogue state tracking model) on MultiWOZ dataset
 ([PR #322](https://github.com/NVIDIA/NeMo/pull/322)) - @chiphuyen, @VahidooX
+- Question answering:
+([PR #390](https://github.com/NVIDIA/NeMo/pull/390)) - @yzhang123
+  - Changed question answering task to use Roberta and Albert as alternative backends to Bert
+  - Added inference mode that does not require ground truth labels
 
 ### Dependencies Update
 - Added dependency on `wrapt` (the new version of the `deprecated` warning) - @tkornuta-nvidia, @DEKHTIARJonathan
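The alternative backends appear to be selected through the example script's existing --pretrained_model_name argument, with the accepted names defined by MODEL_CLASSES in the script below. A hedged invocation sketch in the style of the docstring examples; roberta-base is an illustrative Huggingface-style identifier, and the exact supported names depend on the installed NeMo version:

    python question_answering_squad.py
    --train_file /path_to_data_dir/train.json
    --dev_file /path_to_data_dir/dev.json
    --pretrained_model_name roberta-base
    --mode train_eval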
diff --git a/examples/nlp/question_answering/question_answering_squad.py b/examples/nlp/question_answering/question_answering_squad.py
index 91f896acf7af..5997614888e7 100755
--- a/examples/nlp/question_answering/question_answering_squad.py
+++ b/examples/nlp/question_answering/question_answering_squad.py
@@ -37,6 +37,7 @@
 --weight_decay 0.0
 --lr 3e-5
 --do_lower_case
+--mode train_eval
 
 If --bert_checkpoint is not specified, training starts from
 Huggingface pretrained checkpoints.
@@ -55,17 +56,34 @@
 --optimizer adam_w
 --weight_decay 0.0
 --lr 3e-5
+--mode train_eval
 --do_lower_case
 
 On Huggingface the final Exact Match (EM) and F1 scores are as follows:
 Model              EM     F1
 BERT Base uncased  80.59  88.34
 BERT Large uncased 83.88  90.65
+
+To run only evaluation on pretrained question answering checkpoints on 1 GPU with ground-truth data:
+python question_answering_squad.py
+--dev_file /path_to_data_dir/infer.json
+--checkpoint_dir /path_to_checkpoints
+--do_lower_case
+--mode eval
+
+To run only inference on pretrained question answering checkpoints on 1 GPU without ground-truth data:
+python question_answering_squad.py
+--infer_file /path_to_data_dir/infer.json
+--checkpoint_dir /path_to_checkpoints
+--do_lower_case
+--mode infer
 """
 import argparse
 import json
 import os
 
+import numpy as np
+
 import nemo.collections.nlp as nemo_nlp
 import nemo.core as nemo_core
 from nemo import logging
@@ -79,7 +97,12 @@ def parse_args():
         "--train_file", type=str, help="The training data file. Should be *.json",
     )
     parser.add_argument(
-        "--dev_file", type=str, required=True, help="The evaluation data file. Should be *.json",
+        "--dev_file", type=str, help="The evaluation data file. Should be *.json",
+    )
+    parser.add_argument(
+        "--infer_file",
+        type=str,
+        help="The inference data file. Should be *.json. Does not need to contain ground truth",
     )
     parser.add_argument("--pretrained_model_name", type=str, help="Name of the pre-trained model")
     parser.add_argument("--checkpoint_dir", default=None, type=str, help="Checkpoint directory for inference.")
@@ -115,7 +138,9 @@ def parse_args():
         action='store_true',
         help="Whether to lower case the input text. True for uncased models, False for cased models.",
     )
-    parser.add_argument("--evaluation_only", action='store_true', help="Whether to only do evaluation.")
+    parser.add_argument(
+        "--mode", default="train_eval", choices=["train_eval", "eval", "infer"], help="Mode of model usage."
+    )
     parser.add_argument(
         "--no_data_cache", action='store_true', help="When specified do not load and store cache preprocessed data.",
     )
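Because --mode is declared with argparse choices, invalid values are rejected at parse time; the ValueError fallback in the __main__ block below only guards callers that build args programmatically and bypass parse_args(). A minimal, self-contained sketch of that behavior:

    import argparse

    # Mirrors the --mode flag added above: argparse enforces the choices list,
    # so only train_eval, eval, or infer can reach the rest of the script.
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", default="train_eval", choices=["train_eval", "eval", "infer"])
    print(parser.parse_args(["--mode", "infer"]).mode)  # -> infer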
@@ -209,15 +234,15 @@ def create_pipeline(
     data_file,
     model,
     head,
-    loss_fn,
     max_query_length,
     max_seq_length,
     doc_stride,
     batch_size,
     version_2_with_negative,
+    mode,
     num_gpus=1,
     batches_per_step=1,
-    mode="train",
+    loss_fn=None,
     use_data_cache=True,
 ):
     data_layer = nemo_nlp.nm.data_layers.BertQuestionAnsweringDataLayer(
@@ -239,17 +264,26 @@ def create_pipeline(
     )
 
     qa_output = head(hidden_states=hidden_states)
-    loss_output = loss_fn(
-        logits=qa_output, start_positions=input_data.start_positions, end_positions=input_data.end_positions
-    )
 
     steps_per_epoch = len(data_layer) // (batch_size * num_gpus * batches_per_step)
-    return (
-        loss_output.loss,
-        steps_per_epoch,
-        [loss_output.start_logits, loss_output.end_logits, input_data.unique_ids],
-        data_layer,
-    )
+
+    if mode == "infer":
+        return (
+            steps_per_epoch,
+            [input_data.unique_ids, qa_output],
+            data_layer,
+        )
+    else:
+        loss_output = loss_fn(
+            logits=qa_output, start_positions=input_data.start_positions, end_positions=input_data.end_positions
+        )
+
+        return (
+            loss_output.loss,
+            steps_per_epoch,
+            [input_data.unique_ids, loss_output.start_logits, loss_output.end_logits],
+            data_layer,
+        )
 
 
 MODEL_CLASSES = {
@@ -261,14 +295,24 @@
 if __name__ == "__main__":
     args = parse_args()
-    if not os.path.exists(args.dev_file):
-        raise FileNotFoundError(
-            "eval data not found. Datasets can be obtained using examples/nlp/scripts/get_squad.py"
-        )
-    if not args.evaluation_only and not os.path.exists(args.train_file):
-        raise FileNotFoundError(
-            "train data not found. Datasets can be obtained using examples/nlp/scripts/get_squad.py"
-        )
+
+    if args.mode == "train_eval":
+        if not os.path.exists(args.train_file) or not os.path.exists(args.dev_file):
+            raise FileNotFoundError(
+                "train and dev data not found. Datasets can be obtained using examples/nlp/scripts/get_squad.py"
+            )
+    elif args.mode == "eval":
+        if not os.path.exists(args.dev_file):
+            raise FileNotFoundError(
+                "dev data not found. Datasets can be obtained using examples/nlp/scripts/get_squad.py"
+            )
+    elif args.mode == "infer":
+        if not os.path.exists(args.infer_file):
+            raise FileNotFoundError(
+                "infer data not found. Datasets can be obtained using examples/nlp/scripts/get_squad.py"
+            )
+    else:
+        raise ValueError(f"{args.mode} can only be one of [train_eval, eval, infer]")
 
     # Instantiate neural factory with supported backend
     nf = nemo_core.NeuralModuleFactory(
@@ -328,7 +372,7 @@ def create_pipeline(
     if args.bert_checkpoint is not None:
         model.restore_from(args.bert_checkpoint)
 
-    if not args.evaluation_only:
+    if "train" in args.mode:
         train_loss, train_steps_per_epoch, _, _ = create_pipeline(
             data_file=args.train_file,
             model=model,
@@ -344,24 +388,39 @@ def create_pipeline(
             mode="train",
             use_data_cache=not args.no_data_cache,
         )
-    logging.info(f"training step per epoch: {train_steps_per_epoch}")
-    _, _, eval_output, eval_data_layer = create_pipeline(
-        data_file=args.dev_file,
-        model=model,
-        head=qa_head,
-        loss_fn=squad_loss,
-        max_query_length=args.max_query_length,
-        max_seq_length=args.max_seq_length,
-        doc_stride=args.doc_stride,
-        batch_size=args.batch_size,
-        version_2_with_negative=args.version_2_with_negative,
-        num_gpus=args.num_gpus,
-        batches_per_step=args.batches_per_step,
-        mode="dev",
-        use_data_cache=not args.no_data_cache,
-    )
+    if "eval" in args.mode:
+        _, _, eval_output, eval_data_layer = create_pipeline(
+            data_file=args.dev_file,
+            model=model,
+            head=qa_head,
+            loss_fn=squad_loss,
+            max_query_length=args.max_query_length,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            batch_size=args.batch_size,
+            version_2_with_negative=args.version_2_with_negative,
+            num_gpus=args.num_gpus,
+            batches_per_step=args.batches_per_step,
+            mode="dev",
+            use_data_cache=not args.no_data_cache,
+        )
+    if "infer" in args.mode:
+        _, eval_output, infer_data_layer = create_pipeline(
+            data_file=args.infer_file,
+            model=model,
+            head=qa_head,
+            max_query_length=args.max_query_length,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            batch_size=args.batch_size,
+            version_2_with_negative=args.version_2_with_negative,
+            num_gpus=args.num_gpus,
+            batches_per_step=args.batches_per_step,
+            mode="infer",
+            use_data_cache=not args.no_data_cache,
+        )
 
-    if not args.evaluation_only:
+    if args.mode == "train_eval":
         logging.info(f"steps_per_epoch = {train_steps_per_epoch}")
         callback_train = nemo_core.SimpleLossLoggerCallback(
             tensors=[train_loss],
@@ -402,33 +461,52 @@ def create_pipeline(
             batches_per_step=args.batches_per_step,
             optimization_params={"num_epochs": args.num_epochs, "lr": args.lr},
         )
-    else:
+    else:
+        load_from_folder = None
+
         if args.checkpoint_dir is not None:
             load_from_folder = args.checkpoint_dir
+
         evaluated_tensors = nf.infer(tensors=eval_output, checkpoint_dir=load_from_folder, cache=True)
         unique_ids = []
-        start_logits = []
-        end_logits = []
-        for t in evaluated_tensors[2]:
-            unique_ids.extend(t.tolist())
         for t in evaluated_tensors[0]:
-            start_logits.extend(t.tolist())
-        for t in evaluated_tensors[1]:
-            end_logits.extend(t.tolist())
-
-        exact_match, f1, all_predictions = eval_data_layer.dataset.evaluate(
-            unique_ids=unique_ids,
-            start_logits=start_logits,
-            end_logits=end_logits,
-            n_best_size=args.n_best_size,
-            max_answer_length=args.max_answer_length,
-            version_2_with_negative=args.version_2_with_negative,
-            null_score_diff_threshold=args.null_score_diff_threshold,
-            do_lower_case=args.do_lower_case,
-        )
+            unique_ids.extend(t.tolist())
+        if "eval" in args.mode:
+            start_logits = []
+            end_logits = []
+            for t in evaluated_tensors[1]:
+                start_logits.extend(t.tolist())
+            for t in evaluated_tensors[2]:
+                end_logits.extend(t.tolist())
+
+            exact_match, f1, all_predictions = eval_data_layer.dataset.evaluate(
+                unique_ids=unique_ids,
+                start_logits=start_logits,
+                end_logits=end_logits,
+                n_best_size=args.n_best_size,
+                max_answer_length=args.max_answer_length,
+                version_2_with_negative=args.version_2_with_negative,
+                null_score_diff_threshold=args.null_score_diff_threshold,
+                do_lower_case=args.do_lower_case,
+            )
 
-        logging.info(f"exact_match: {exact_match}, f1: {f1}")
+            logging.info(f"exact_match: {exact_match}, f1: {f1}")
+
+        elif "infer" in args.mode:
+            logits = []
+            for t in evaluated_tensors[1]:
+                logits.extend(t.tolist())
+            start_logits, end_logits = np.split(np.asarray(logits), 2, axis=-1)
+            (all_predictions, all_nbest_json, scores_diff_json) = infer_data_layer.dataset.get_predictions(
+                unique_ids=unique_ids,
+                start_logits=start_logits,
+                end_logits=end_logits,
+                n_best_size=args.n_best_size,
+                max_answer_length=args.max_answer_length,
+                version_2_with_negative=args.version_2_with_negative,
+                null_score_diff_threshold=args.null_score_diff_threshold,
+                do_lower_case=args.do_lower_case,
+            )
 
         if args.output_prediction_file is not None:
             with open(args.output_prediction_file, "w") as writer:
                 writer.write(json.dumps(all_predictions, indent=4) + "\n")
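In infer mode the pipeline returns the raw head output instead of separate loss tensors, so the script splits the combined logits itself, as shown above. A toy, self-contained illustration of that np.split step (the shapes here are hypothetical; the real tensor comes from the QA head):

    import numpy as np

    # The QA head emits one logits tensor whose trailing dimension holds the
    # (start, end) pair; np.split separates it along the last axis.
    logits = np.random.rand(4, 384, 2)  # e.g. 4 features, sequence length 384
    start_logits, end_logits = np.split(logits, 2, axis=-1)
    print(start_logits.shape, end_logits.shape)  # (4, 384, 1) (4, 384, 1)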
diff --git a/nemo/collections/nlp/data/datasets/qa_squad_dataset.py b/nemo/collections/nlp/data/datasets/qa_squad_dataset.py
index 6b33c4a581b4..b02d0036dc18 100644
--- a/nemo/collections/nlp/data/datasets/qa_squad_dataset.py
+++ b/nemo/collections/nlp/data/datasets/qa_squad_dataset.py
@@ -86,8 +86,8 @@ def __init__(
         self.version_2_with_negative = version_2_with_negative
         self.processor = SquadProcessor(data_file=data_file, mode=mode)
         self.mode = mode
-        if mode != "dev" and mode != "train":
-            raise ValueError(f"mode should be either 'train' or 'dev' but got {mode}")
+        if mode not in ["dev", "train", "infer"]:
+            raise ValueError(f"mode should be either 'train', 'dev', or 'infer' but got {mode}")
         self.examples = self.processor.get_examples()
 
         cached_features_file = (
@@ -107,7 +107,7 @@ def __init__(
                 max_seq_length=max_seq_length,
                 doc_stride=doc_stride,
                 max_query_length=max_query_length,
-                has_groundtruth=True,
+                has_groundtruth=mode != "infer",
             )
 
             if use_cache:
@@ -122,14 +122,22 @@ def __len__(self):
 
     def __getitem__(self, idx):
         feature = self.features[idx]
-        return (
-            np.array(feature.input_ids),
-            np.array(feature.segment_ids),
-            np.array(feature.input_mask),
-            np.array(feature.start_position),
-            np.array(feature.end_position),
-            np.array(feature.unique_id),
-        )
+        if self.mode == "infer":
+            return (
+                np.array(feature.input_ids),
+                np.array(feature.segment_ids),
+                np.array(feature.input_mask),
+                np.array(feature.unique_id),
+            )
+        else:
+            return (
+                np.array(feature.input_ids),
+                np.array(feature.segment_ids),
+                np.array(feature.input_mask),
+                np.array(feature.unique_id),
+                np.array(feature.start_position),
+                np.array(feature.end_position),
+            )
 
     def get_predictions(
         self,
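Since infer mode sets has_groundtruth=False, the --infer_file only needs contexts and questions. A hedged sketch of a minimal input file; the field names follow the public SQuAD schema, with empty answers standing in for the missing labels, and NeMo's exact expectations may differ in detail:

    import json

    # Hypothetical infer.json: no ground-truth answers are required.
    infer_data = {
        "data": [
            {
                "title": "demo",
                "paragraphs": [
                    {
                        "context": "NeMo is a toolkit for conversational AI applications.",
                        "qas": [{"id": "q1", "question": "What is NeMo?", "answers": []}],
                    }
                ],
            }
        ]
    }
    with open("infer.json", "w") as f:
        json.dump(infer_data, f)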
diff --git a/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py b/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py
index 096ed3a7764a..140392fb4e0b 100644
--- a/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py
+++ b/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py
@@ -59,9 +59,9 @@ def output_ports(self):
             "input_ids": NeuralType(('B', 'T'), ChannelType()),
             "input_type_ids": NeuralType(('B', 'T'), ChannelType()),
             "input_mask": NeuralType(('B', 'T'), ChannelType()),
-            "start_positions": NeuralType(tuple('B'), ChannelType()),
-            "end_positions": NeuralType(tuple('B'), ChannelType()),
             "unique_ids": NeuralType(tuple('B'), ChannelType()),
+            "start_positions": NeuralType(tuple('B'), ChannelType(), optional=True),
+            "end_positions": NeuralType(tuple('B'), ChannelType(), optional=True),
         }
 
     def __init__(
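With start_positions and end_positions now optional output ports, the same data layer serves the train_eval, eval, and infer pipelines. To close the loop, a hedged usage sketch for the artifact written by --output_prediction_file (the file name is illustrative; the script serializes all_predictions, which in the standard SQuAD prediction format maps each question id to its predicted answer text):

    import json

    # Read back the predictions written after an infer run.
    with open("predictions.json") as f:
        all_predictions = json.load(f)
    for qid, answer in all_predictions.items():
        print(qid, "->", answer)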