Upgraded to use AutoModel and AutoTokenizer for ContextualWordEmbsAug, ContextualWordEmbsForSentenceAug and AbstSummAug. Fix #133, #105.

makcedward committed Aug 23, 2020
1 parent d766248 commit 10fa3ff
Showing 12 changed files with 119 additions and 70 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -143,7 +143,7 @@ pip install librosa>=0.7.1
## Recent Changes

### 1.0.0dev Sep, 2020
*
* Upgraded to use AutoModel and AutoTokenizer for ContextualWordEmbsAug, ContextualWordEmbsForSentenceAug and AbstSummAug. Fix [#133](https://github.com/makcedward/nlpaug/issues/133), [#105](https://github.com/makcedward/nlpaug/issues/105)

See [changelog](https://github.com/makcedward/nlpaug/blob/master/CHANGE.md) for more details.

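For reference, a minimal usage sketch of the upgraded augmenters (a sketch only, assuming nlpaug 1.0.0dev with torch and transformers installed; the model names are the defaults that appear later in this diff, and AbstSummAug follows the same pattern):

```python
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

text = 'The quick brown fox jumps over the lazy dog.'

# Word-level substitution, now backed by AutoTokenizer / AutoModelForMaskedLM
word_aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action='substitute')
print(word_aug.augment(text))

# Sentence-level insertion, now backed by AutoTokenizer / AutoModelForCausalLM
sent_aug = nas.ContextualWordEmbsForSentenceAug(model_path='distilgpt2')
print(sent_aug.augment(text))
```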
17 changes: 10 additions & 7 deletions nlpaug/augmenter/sentence/context_word_embs_sentence.py
@@ -13,7 +13,7 @@


def init_context_word_embs_sentence_model(model_path, device, force_reload=False, temperature=1.0, top_k=None,
top_p=None, optimize=None):
top_p=None, optimize=None, silence=True):
global CONTEXT_WORD_EMBS_SENTENCE_MODELS

model_name = os.path.basename(model_path)
@@ -25,10 +25,10 @@ def init_context_word_embs_sentence_model(model_path, device, force_reload=False

if 'xlnet' in model_path:
model = nml.XlNet(model_path, device=device, temperature=temperature, top_k=top_k, top_p=top_p,
optimize=optimize)
optimize=optimize, silence=silence)
elif 'gpt2' in model_path:
model = nml.Gpt2(model_path, device=device, temperature=temperature, top_k=top_k, top_p=top_p,
optimize=optimize)
optimize=optimize, silence=silence)
else:
raise ValueError('Model name value is unexpected. Only support XLNet and GPT2 model.')

@@ -57,6 +57,8 @@ class ContextualWordEmbsForSentenceAug(SentenceAugmenter):
`external_memory`: Persisting previous computed result for next prediction. Extra memory will be used in order
to have shorter inference time. `gpt2` and `distilgpt2` are supported.
:param bool include_detail: Change detail will be returned if it is True.
:param bool silence: Default is True. The transformers library will print out a warning message when leveraging a
pre-trained model. Set to True to disable the expected warning message.
:param str name: Name of this augmenter
>>> import nlpaug.augmenter.sentence as nas
@@ -65,19 +67,20 @@ class ContextualWordEmbsForSentenceAug(SentenceAugmenter):

def __init__(self, model_path='distilgpt2', temperature=1.0, top_k=100, top_p=None,
name='ContextualWordEmbsForSentence_Aug',
device=None, force_reload=False, optimize=None, include_detail=False, verbose=0):
device=None, force_reload=False, optimize=None, include_detail=False, verbose=0, silence=True):
super().__init__(
action=Action.INSERT, name=name, tokenizer=None, stopwords=None, device=device,
include_detail=include_detail, verbose=verbose)
self.model_path = model_path
self.temperature = temperature
self.top_k = top_k
self.top_p = top_p
self.silence = silence

self._init()
self.model = self.get_model(
model_path=model_path, device=device, force_reload=force_reload, temperature=temperature, top_k=top_k,
top_p=top_p, optimize=optimize)
top_p=top_p, optimize=optimize, silence=silence)
self.device = self.model.device

def _init(self):
@@ -142,6 +145,6 @@ def insert(self, data):

@classmethod
def get_model(cls, model_path, device='cuda', force_reload=False, temperature=1.0, top_k=None, top_p=0.0,
optimize=None):
optimize=None, silence=True):
return init_context_word_embs_sentence_model(model_path, device, force_reload, temperature, top_k, top_p,
optimize=optimize)
optimize=optimize, silence=silence)
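A short sketch of the new `silence` switch wired through above (it assumes the constructor signature added in this diff; nothing else is changed in how the augmenter is used):

```python
import nlpaug.augmenter.sentence as nas

# Default (silence=True): the expected transformers weight-initialization
# warning is suppressed while the pre-trained model is loaded.
aug = nas.ContextualWordEmbsForSentenceAug(model_path='distilgpt2')

# Pass silence=False to leave the transformers logging output unchanged.
verbose_aug = nas.ContextualWordEmbsForSentenceAug(model_path='distilgpt2', silence=False)

print(aug.augment('It is easy to say something but hard to do.'))
```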
30 changes: 17 additions & 13 deletions nlpaug/augmenter/word/context_word_embs.py
@@ -13,7 +13,7 @@


def init_context_word_embs_model(model_path, device, force_reload=False, temperature=1.0, top_k=None, top_p=None,
optimize=None):
optimize=None, silence=True):
global CONTEXT_WORD_EMBS_MODELS

model_name = os.path.basename(model_path)
@@ -23,14 +23,15 @@ def init_context_word_embs_model(model_path, device, force_reload=False, tempera
CONTEXT_WORD_EMBS_MODELS[model_name].top_p = top_p
return CONTEXT_WORD_EMBS_MODELS[model_name]

if 'distilbert' in model_path:
model = nml.DistilBert(model_path, device=device, temperature=temperature, top_k=top_k, top_p=top_p)
elif 'roberta' in model_path:
model = nml.Roberta(model_path, device=device, temperature=temperature, top_k=top_k, top_p=top_p)
elif 'bert' in model_path:
model = nml.Bert(model_path, device=device, temperature=temperature, top_k=top_k, top_p=top_p)
elif 'xlnet' in model_path:
model = nml.XlNet(model_path, device=device, temperature=temperature, top_k=top_k, top_p=top_p, optimize=optimize)
if 'distilbert' in model_path.lower():
model = nml.DistilBert(model_path, device=device, temperature=temperature, top_k=top_k, top_p=top_p, silence=silence)
elif 'roberta' in model_path.lower():
model = nml.Roberta(model_path, device=device, temperature=temperature, top_k=top_k, top_p=top_p, silence=silence)
elif 'bert' in model_path.lower():
model = nml.Bert(model_path, device=device, temperature=temperature, top_k=top_k, top_p=top_p, silence=silence)
elif 'xlnet' in model_path.lower():
model = nml.XlNet(model_path, device=device, temperature=temperature, top_k=top_k, top_p=top_p, optimize=optimize,
silence=silence)
else:
raise ValueError('Model name value is unexpected. Only support BERT, DistilBERT, RoBERTa and XLNet model.')

@@ -70,6 +71,8 @@ class ContextualWordEmbsAug(WordAugmenter):
:param bool optimize: If true, optimized process will be executed. For example, GPT2 will use "return_past" to
reduce inference time.
:param bool include_detail: Change detail will be returned if it is True.
:param bool silence: Default is True. The transformers library will print out a warning message when leveraging a
pre-trained model. Set to True to disable the expected warning message.
:param str name: Name of this augmenter
>>> import nlpaug.augmenter.word as naw
@@ -79,7 +82,7 @@ class ContextualWordEmbsAug(WordAugmenter):
def __init__(self, model_path='bert-base-uncased', action="substitute", temperature=1.0, top_k=100, top_p=None,
name='ContextualWordEmbs_Aug', aug_min=1, aug_max=10, aug_p=0.3, stopwords=None,
skip_unknown_word=False, device=None, force_reload=False, optimize=None, stopwords_regex=None,
verbose=0, include_detail=False):
verbose=0, include_detail=False, silence=True):
super().__init__(
action=action, name=name, aug_p=aug_p, aug_min=aug_min, aug_max=aug_max, tokenizer=None,
device=device, stopwords=stopwords, verbose=verbose, stopwords_regex=stopwords_regex,
@@ -89,11 +92,12 @@ def __init__(self, model_path='bert-base-uncased', action="substitute", temperat
self.temperature = temperature
self.top_k = top_k
self.top_p = top_p
self.silence = silence

self._init()
self.model = self.get_model(
model_path=model_path, device=device, force_reload=force_reload, temperature=temperature, top_k=top_k,
top_p=top_p, optimize=optimize)
top_p=top_p, optimize=optimize, silence=silence)
# Override stopwords
if stopwords is not None and self.model_type in ['xlnet', 'roberta']:
stopwords = [self.stopwords]
@@ -322,5 +326,5 @@ def substitute(self, data):

@classmethod
def get_model(cls, model_path, device='cuda', force_reload=False, temperature=1.0, top_k=None, top_p=0.0,
optimize=None):
return init_context_word_embs_model(model_path, device, force_reload, temperature, top_k, top_p, optimize)
optimize=None, silence=True):
return init_context_word_embs_model(model_path, device, force_reload, temperature, top_k, top_p, optimize, silence)
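A side note on the case-insensitive matching introduced above: the branch order is significant, because 'bert' is a substring of both 'distilbert' and 'roberta'. The helper below is a hypothetical illustration of that routing, not part of nlpaug:

```python
def pick_wrapper(model_path: str) -> str:
    """Illustrative only: mirrors the dispatch order used in init_context_word_embs_model."""
    name = model_path.lower()
    if 'distilbert' in name:
        return 'DistilBert'
    elif 'roberta' in name:
        return 'Roberta'
    elif 'bert' in name:
        return 'Bert'
    elif 'xlnet' in name:
        return 'XlNet'
    raise ValueError('Model name value is unexpected. Only support BERT, DistilBERT, RoBERTa and XLNet model.')

assert pick_wrapper('DistilBERT-base-uncased') == 'DistilBert'  # matched before the generic 'bert' branch
assert pick_wrapper('roberta-base') == 'Roberta'                # 'bert' is a substring here too
assert pick_wrapper('bert-base-multilingual-cased') == 'Bert'
```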
16 changes: 12 additions & 4 deletions nlpaug/model/lang_models/bart.py
@@ -1,6 +1,8 @@
import logging

try:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
except ImportError:
# No installation required if not using this function
pass
@@ -12,8 +14,9 @@

class Bart(LanguageModels):
# https://arxiv.org/pdf/1910.13461.pdf
def __init__(self, model_path='facebook/bart-large-cnn', min_length=10, max_length=20, num_beam=3, no_repeat_ngram_size=3, device='cuda'):
super().__init__(device, temperature=None, top_k=None, top_p=None)
def __init__(self, model_path='facebook/bart-large-cnn', min_length=10, max_length=20, num_beam=3, no_repeat_ngram_size=3,
device='cuda', silence=True):
super().__init__(device, temperature=None, top_k=None, top_p=None, silence=silence)
try:
import transformers
except ModuleNotFoundError:
@@ -26,7 +29,12 @@ def __init__(self, model_path='facebook/bart-large-cnn', min_length=10, max_leng
self.no_repeat_ngram_size = no_repeat_ngram_size

self.tokenizer = AutoTokenizer.from_pretrained(model_path)
self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
if silence:
# Transformers throws a warning regarding weight initialization. It is expected.
orig_log_level = logging.getLogger('transformers.' + 'modeling_utils').getEffectiveLevel()
logging.getLogger('transformers.' + 'modeling_utils').setLevel(logging.ERROR)
self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
logging.getLogger('transformers.' + 'modeling_utils').setLevel(orig_log_level)
else:
self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

self.model.to(self.device)
self.model.eval()
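The same silencing pattern recurs in bert.py, distilbert.py, gpt2.py and roberta.py below. As a refactoring sketch only (not code from this commit), it could be factored into a small context manager built on nothing but the standard logging module and the 'transformers.modeling_utils' logger name used throughout this diff:

```python
import logging
from contextlib import contextmanager

@contextmanager
def silenced(logger_name='transformers.modeling_utils'):
    """Temporarily hide the expected weight-initialization warning, then restore the level."""
    logger = logging.getLogger(logger_name)
    orig_level = logger.getEffectiveLevel()
    logger.setLevel(logging.ERROR)
    try:
        yield
    finally:
        logger.setLevel(orig_level)

# Usage, mirroring the blocks added in this commit:
# with silenced():
#     model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
```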
20 changes: 12 additions & 8 deletions nlpaug/model/lang_models/bert.py
@@ -1,7 +1,8 @@
import logging

try:
import torch
from transformers import BertTokenizer, BertForMaskedLM
# from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForMaskedLM, AutoTokenizer
except ImportError:
# No installation required if not using this function
pass
@@ -17,19 +18,22 @@ class Bert(LanguageModels):
MASK_TOKEN = '[MASK]'
SUBWORD_PREFIX = '##'

def __init__(self, model_path='bert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'):
super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
def __init__(self, model_path='bert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda', silence=True):
super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p, silence=silence)
try:
import transformers
except ModuleNotFoundError:
raise ModuleNotFoundError('Missing transformers library. Install transformers by `pip install transformers`')

self.model_path = model_path

# self.tokenizer = AutoTokenizer.from_pretrained(model_path)
# self.model = AutoModel.from_pretrained(model_path)
self.tokenizer = BertTokenizer.from_pretrained(model_path)
self.model = BertForMaskedLM.from_pretrained(model_path)
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
if silence:
# Transformers throws a warning regarding weight initialization. It is expected.
orig_log_level = logging.getLogger('transformers.' + 'modeling_utils').getEffectiveLevel()
logging.getLogger('transformers.' + 'modeling_utils').setLevel(logging.ERROR)
self.model = AutoModelForMaskedLM.from_pretrained(model_path)
logging.getLogger('transformers.' + 'modeling_utils').setLevel(orig_log_level)
else:
self.model = AutoModelForMaskedLM.from_pretrained(model_path)

self.model.to(self.device)
self.model.eval()
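As a quick sanity check of the Auto* swap above, a standalone fill-mask sketch (assumes torch and transformers are installed; 'bert-base-uncased' is this file's default model_path):

```python
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

inputs = tokenizer('The quick brown fox [MASK] over the lazy dog.', return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs)[0]  # [batch, seq_len, vocab_size]

# Locate the [MASK] position and decode the highest-scoring replacement
mask_positions = (inputs['input_ids'][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
predicted_ids = logits[0, mask_positions].argmax(dim=-1).tolist()
print(tokenizer.decode(predicted_ids))  # typically 'jumps'
```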
20 changes: 12 additions & 8 deletions nlpaug/model/lang_models/distilbert.py
@@ -1,7 +1,8 @@
import logging

try:
import torch
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
# from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForMaskedLM, AutoTokenizer
except ImportError:
# No installation required if not using this function
pass
@@ -17,19 +18,22 @@ class DistilBert(LanguageModels):
MASK_TOKEN = '[MASK]'
SUBWORD_PREFIX = '##'

def __init__(self, model_path='distilbert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'):
super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
def __init__(self, model_path='distilbert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda', silence=True):
super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p, silence=silence)
try:
import transformers
except ModuleNotFoundError:
raise ModuleNotFoundError('Missing transformers library. Install transformers by `pip install transformers`')

self.model_path = model_path

# self.tokenizer = AutoTokenizer.from_pretrained(model_path)
# self.model = AutoModel.from_pretrained(model_path)
self.tokenizer = DistilBertTokenizer.from_pretrained(model_path)
self.model = DistilBertForMaskedLM.from_pretrained(model_path)
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
if silence:
# Transformers throws a warning regarding weight initialization. It is expected.
orig_log_level = logging.getLogger('transformers.' + 'modeling_utils').getEffectiveLevel()
logging.getLogger('transformers.' + 'modeling_utils').setLevel(logging.ERROR)
self.model = AutoModelForMaskedLM.from_pretrained(model_path)
logging.getLogger('transformers.' + 'modeling_utils').setLevel(orig_log_level)
else:
self.model = AutoModelForMaskedLM.from_pretrained(model_path)

self.model.to(self.device)
self.model.eval()
3 changes: 2 additions & 1 deletion nlpaug/model/lang_models/fairseq.py
@@ -9,7 +9,8 @@


class Fairseq(LanguageModels):
def __init__(self, from_model_name, from_model_checkpt, to_model_name, to_model_checkpt, tokenzier_name='moses', bpe_name='fastbpe', device='cuda'):
def __init__(self, from_model_name, from_model_checkpt, to_model_name, to_model_checkpt, tokenzier_name='moses', bpe_name='fastbpe',
device='cuda'):
super().__init__(device, temperature=None, top_k=None, top_p=None)

try:
21 changes: 12 additions & 9 deletions nlpaug/model/lang_models/gpt2.py
@@ -1,7 +1,8 @@
import logging

try:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# from transformers import AutoModel, AutoTokenizer # Thrown error when using nucleus sampling
from transformers import AutoModelForCausalLM, AutoTokenizer
except ImportError:
# No installation required if not using this function
pass
@@ -13,20 +14,22 @@ class Gpt2(LanguageModels):
# https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf
SUBWORD_PREFIX = 'Ġ'

def __init__(self, model_path='gpt2', temperature=1.0, top_k=None, top_p=None, device=None, optimize=None):
super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p, optimize=optimize)
def __init__(self, model_path='gpt2', temperature=1.0, top_k=None, top_p=None, device=None, optimize=None, silence=True):
super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p, optimize=optimize, silence=silence)
try:
import transformers
except ModuleNotFoundError:
raise ModuleNotFoundError('Missing transformers library. Install transformers by `pip install transformers`')

self.model_path = model_path

# self.tokenizer = AutoTokenizer.from_pretrained(model_path)
# self.model = AutoModel.from_pretrained(model_path)

self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
self.model = GPT2LMHeadModel.from_pretrained(model_path)
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
if silence:
# Transformers throws a warning regarding weight initialization. It is expected.
orig_log_level = logging.getLogger('transformers.' + 'modeling_utils').getEffectiveLevel()
logging.getLogger('transformers.' + 'modeling_utils').setLevel(logging.ERROR)
self.model = AutoModelForCausalLM.from_pretrained(model_path)
logging.getLogger('transformers.' + 'modeling_utils').setLevel(orig_log_level)
else:
self.model = AutoModelForCausalLM.from_pretrained(model_path)

self.model.to(self.device)
self.model.eval()
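Likewise for the causal-LM path, a minimal next-token sketch using the same Auto* classes (an illustration only; 'gpt2' is this file's default model_path, and torch plus transformers are assumed to be installed):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')
model.eval()

inputs = tokenizer('The quick brown fox', return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs)[0]  # [batch, seq_len, vocab_size]

next_token_id = int(logits[0, -1].argmax())
print(tokenizer.decode([next_token_id]))  # the single most likely continuation token
```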
3 changes: 2 additions & 1 deletion nlpaug/model/lang_models/language_models.py
@@ -12,7 +12,7 @@
class LanguageModels:
OPTIMIZE_ATTRIBUTES = ['external_memory', 'return_proba']

def __init__(self, device=None, temperature=1.0, top_k=100, top_p=0.01, optimize=None):
def __init__(self, device=None, temperature=1.0, top_k=100, top_p=0.01, optimize=None, silence=True):
try:
import torch
except ModuleNotFoundError:
@@ -23,6 +23,7 @@ def __init__(self, device=None, temperature=1.0, top_k=100, top_p=0.01, optimize
self.top_k = top_k
self.top_p = top_p
self.optimize = self.init_optimize(optimize)
self.silence = silence

@classmethod
def get_default_optimize_config(cls):
20 changes: 12 additions & 8 deletions nlpaug/model/lang_models/roberta.py
@@ -1,7 +1,8 @@
import logging

try:
import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM
# from transformers import AutoModel, AutoTokenizer # Thrown error when using nucleus sampling
from transformers import AutoModelForMaskedLM, AutoTokenizer
except ImportError:
# No installation required if not using this function
pass
@@ -17,19 +18,22 @@ class Roberta(LanguageModels):
MASK_TOKEN = '<mask>'
SUBWORD_PREFIX = 'Ġ'

def __init__(self, model_path='roberta-base', temperature=1.0, top_k=None, top_p=None, device='cuda'):
super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
def __init__(self, model_path='roberta-base', temperature=1.0, top_k=None, top_p=None, device='cuda', silence=True):
super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p, silence=silence)
try:
import transformers
except ModuleNotFoundError:
raise ModuleNotFoundError('Missing transformers library. Install transformers by `pip install transformers`')

self.model_path = model_path

# self.tokenizer = AutoTokenizer.from_pretrained(model_path)
# self.model = AutoModel.from_pretrained(model_path)
self.tokenizer = RobertaTokenizer.from_pretrained(model_path)
self.model = RobertaForMaskedLM.from_pretrained(model_path)
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
if silence:
# Transformers throws a warning regarding weight initialization. It is expected.
orig_log_level = logging.getLogger('transformers.' + 'modeling_utils').getEffectiveLevel()
logging.getLogger('transformers.' + 'modeling_utils').setLevel(logging.ERROR)
self.model = AutoModelForMaskedLM.from_pretrained(model_path)
logging.getLogger('transformers.' + 'modeling_utils').setLevel(orig_log_level)
else:
self.model = AutoModelForMaskedLM.from_pretrained(model_path)

self.model.to(self.device)
self.model.eval()