Fix word piece attributes
Attributes for config items should be public and match the argument
name.
mattdangerw committed Apr 7, 2022
1 parent 7654761 commit 2009453
Showing 1 changed file with 30 additions and 30 deletions:
keras_nlp/tokenizers/word_piece_tokenizer.py
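The rationale in the commit message is the standard Keras serialization contract: when `get_config()` keys match the `__init__` argument names, and the values are stored on same-named public attributes, the default `from_config(config)` (which effectively calls `cls(**config)`) reconstructs the object with no custom logic. A minimal sketch of that convention, illustrative only and not taken from this commit:

```python
# Minimal sketch of the Keras config convention the commit message
# describes; names here are hypothetical, not code from this repository.
class MyTokenizer:
    def __init__(self, vocabulary, sequence_length=None, lowercase=True):
        # Store each argument on a public attribute with the same name.
        self.vocabulary = vocabulary
        self.sequence_length = sequence_length
        self.lowercase = lowercase

    def get_config(self):
        # Config keys mirror the argument names exactly.
        return {
            "vocabulary": self.vocabulary,
            "sequence_length": self.sequence_length,
            "lowercase": self.lowercase,
        }

    @classmethod
    def from_config(cls, config):
        # Because keys and argument names line up, this round-trips.
        return cls(**config)
```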
@@ -188,28 +188,28 @@ def __init__(
         super().__init__(**kwargs)
 
         if isinstance(vocabulary, str):
-            self._vocab = [
+            self.vocabulary = [
                 line.rstrip() for line in tf.io.gfile.GFile(vocabulary)
             ]
         elif isinstance(vocabulary, Iterable):
             # Make a copy.
-            self._vocab = list(vocabulary)
+            self.vocabulary = list(vocabulary)
         else:
             raise ValueError(
                 "Vocabulary must be a file path or list of terms. "
                 f"Received: vocabulary={vocabulary}"
             )
 
-        self._sequence_length = sequence_length
-        self._lowercase = lowercase
-        self._strip_accents = strip_accents
-        self._split_pattern = split_pattern
-        self._keep_pattern = keep_pattern
-        self._suffix_indicator = suffix_indicator
-        self._oov_token = oov_token
+        self.sequence_length = sequence_length
+        self.lowercase = lowercase
+        self.strip_accents = strip_accents
+        self.split_pattern = split_pattern
+        self.keep_pattern = keep_pattern
+        self.suffix_indicator = suffix_indicator
+        self.oov_token = oov_token
 
         self._fast_word_piece = tf_text.FastWordpieceTokenizer(
-            vocab=self._vocab,
+            vocab=self.vocabulary,
             token_out_type=self.compute_dtype,
             suffix_indicator=suffix_indicator,
             unknown_token=oov_token,
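For reference, both vocabulary paths handled by the branch above, as a hedged usage sketch (the file path is hypothetical, and the `keras_nlp.tokenizers.WordPieceTokenizer` import assumes this repository's package layout):

```python
import keras_nlp

# From a list of terms (the Iterable branch above; the list is copied).
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=["[UNK]", "the", "qu", "##ick", "fox"],
)

# From a plain-text file with one vocabulary entry per line
# (the str branch above). "vocab.txt" is a hypothetical path.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary="vocab.txt",
)
```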
@@ -219,22 +219,22 @@ def __init__(
 
     def get_vocabulary(self) -> List[str]:
         """Get the tokenizer vocabulary as a list of string tokens."""
-        return self._vocab
+        return self.vocabulary
 
     def vocabulary_size(self) -> int:
         """Get the size of the tokenizer vocabulary."""
-        return len(self._vocab)
+        return len(self.vocabulary)
 
     def id_to_token(self, id: int) -> str:
         """Convert an integer id to a string token."""
-        return self._vocab[id]
+        return self.vocabulary[id]
 
     def token_to_id(self, token: str) -> int:
         """Convert a string token to an integer id."""
         # This will be slow, but keep memory usage down compared to building a
         # dict. Assuming the main use case is looking up a few special tokens
         # early in the vocab, this should be fine.
-        return self._vocab.index(token)
+        return self.vocabulary.index(token)
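The comment in `token_to_id` captures a deliberate memory/speed trade-off: `list.index` scans linearly but stores nothing extra, whereas a lookup dict gives O(1) access at the cost of holding a second copy of every vocabulary entry. A small illustration (the vocabulary contents are made up):

```python
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "the", "quick", "fox"]

# What token_to_id does: a linear scan, cheap for special tokens
# that sit near the front of the vocabulary.
assert vocab.index("[CLS]") == 2

# The alternative it avoids: O(1) lookups, but an extra dict of all
# vocabulary entries held in memory.
token_to_id = {token: i for i, token in enumerate(vocab)}
assert token_to_id["[CLS]"] == 2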
@@ -243,14 +243,14 @@ def get_config(self) -> Dict[str, Any]:
                 # Ideally a vocabulary would be saved as a plain text asset in
                 # the saved model. We have no good way to support this
                 # currently, so we save the vocabulary in the config.
-                "vocabulary": self._vocab,
-                "sequence_length": self._sequence_length,
-                "lowercase": self._lowercase,
-                "strip_accents": self._strip_accents,
-                "split_pattern": self._split_pattern,
-                "keep_pattern": self._keep_pattern,
-                "suffix_indicator": self._suffix_indicator,
-                "oov_token": self._oov_token,
+                "vocabulary": self.vocabulary,
+                "sequence_length": self.sequence_length,
+                "lowercase": self.lowercase,
+                "strip_accents": self.strip_accents,
+                "split_pattern": self.split_pattern,
+                "keep_pattern": self.keep_pattern,
+                "suffix_indicator": self.suffix_indicator,
+                "oov_token": self.oov_token,
             }
         )
         return config
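Since the vocabulary rides along in the config, a tokenizer can be reconstructed from `get_config()` alone. A hedged round-trip sketch, assuming the keras_nlp API shown in this file and the default Keras `from_config` behavior:

```python
import keras_nlp

tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=["[UNK]", "the", "qu", "##ick", "fox"],
    sequence_length=8,
)
config = tokenizer.get_config()

# The vocabulary list is embedded in the config, so this rebuilds an
# equivalent tokenizer, and the now-public attribute is visible on it.
restored = keras_nlp.tokenizers.WordPieceTokenizer.from_config(config)
assert restored.vocabulary == tokenizer.vocabulary
```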
@@ -263,18 +263,18 @@ def tokenize(self, inputs):
         if scalar_input:
             inputs = tf.expand_dims(inputs, 0)
         # Optionally normalize and split inputs.
-        if self._lowercase:
+        if self.lowercase:
             inputs = tf_text.case_fold_utf8(inputs)
-        if self._strip_accents:
+        if self.strip_accents:
             # Normalize unicode to NFD, which splits out accent mark characters.
             inputs = tf_text.normalize_utf8(inputs, "NFD")
             # Remove the accent marks.
             inputs = tf.strings.regex_replace(inputs, r"\p{Mn}", "")
-        if self._split_pattern:
+        if self.split_pattern:
             inputs = tf_text.regex_split(
                 inputs,
-                delim_regex_pattern=self._split_pattern,
-                keep_delim_regex_pattern=self._keep_pattern,
+                delim_regex_pattern=self.split_pattern,
+                keep_delim_regex_pattern=self.keep_pattern,
             )
 
         # Apply word piece and coerce shape for outputs.
@@ -284,14 +284,14 @@ def tokenize(self, inputs):
         # ragged dimension which is a better out of box default.
         tokens = tokens.merge_dims(-2, -1)
         # Convert to a dense output if `sequence_length` is set.
-        if self._sequence_length:
+        if self.sequence_length:
             output_shape = tokens.shape.as_list()
-            output_shape[-1] = self._sequence_length
+            output_shape[-1] = self.sequence_length
             tokens = tokens.to_tensor(shape=output_shape)
         # Convert to a dense output if input is scalar.
         if scalar_input:
             tokens = tf.squeeze(tokens, 0)
-            tf.ensure_shape(tokens, shape=[self._sequence_length])
+            tf.ensure_shape(tokens, shape=[self.sequence_length])
 
         return tokens
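End to end, `tokenize` normalizes, splits, applies word piece, then coerces shape: a scalar input with `sequence_length` set comes back as a dense vector of that length. A hedged example with a toy vocabulary (token ids and the exact output depend on the vocabulary and the default normalization/splitting shown above):

```python
import tensorflow as tf
import keras_nlp

tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=["[UNK]", "the", "qu", "##ick", "fox"],
    sequence_length=6,
)

# Scalar string in -> dense [sequence_length] vector out, per the shape
# logic above. With this toy vocab, roughly: "the" -> 1, "quick" -> 2, 3
# ("qu" + "##ick"), "fox" -> 4, then zero padding.
tokens = tokenizer.tokenize(tf.constant("The quick fox"))
print(tokens)  # e.g. tf.Tensor([1 2 3 4 0 0], shape=(6,), dtype=int32)
```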

