diff --git a/keras_nlp/tokenizers/word_piece_tokenizer_trainer.py b/keras_nlp/tokenizers/word_piece_tokenizer_trainer.py
index a3e4bd3a6c..eda6c989ce 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer_trainer.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer_trainer.py
@@ -118,6 +118,7 @@ def compute_word_piece_vocabulary(
             f"Received: {type(data)}."
         )
     if isinstance(data, list):
+        # Processing list of file paths.
         if not split:
             raise ValueError(
                 "When learning a vocab from files, `split` must be `True`. "
@@ -125,7 +126,12 @@ def compute_word_piece_vocabulary(
                 "data as a dataset, split it, and pass it to "
                 "`compute_word_piece_vocabulary()` with split=False."
             )
-        data = tf.data.TextLineDataset(data)
+        path_ds = tf.data.Dataset.from_tensor_slices(data)
+        # Uses map to read filepaths.
+        data = path_ds.map(
+            lambda path: tf.io.read_file(path),
+            num_parallel_calls=tf.data.AUTOTUNE,
+        )
     words_data = data.map(
         lambda text: pretokenize(text, lowercase, strip_accents, split),
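
For context, a minimal standalone sketch of what this diff changes: before, the list of file paths was fed to `tf.data.TextLineDataset`, which yields one element per line; after, each path is mapped through `tf.io.read_file`, so each element is the full contents of one file, read in parallel. The file names below are hypothetical and only for illustration.

```python
import tensorflow as tf

# Hypothetical file paths; any list of text files would work.
file_paths = ["corpus_a.txt", "corpus_b.txt"]

# Old behavior: one dataset element per line of each file.
line_ds = tf.data.TextLineDataset(file_paths)

# New behavior: one dataset element per file, with reads parallelized
# via num_parallel_calls=tf.data.AUTOTUNE.
path_ds = tf.data.Dataset.from_tensor_slices(file_paths)
file_ds = path_ds.map(
    lambda path: tf.io.read_file(path),
    num_parallel_calls=tf.data.AUTOTUNE,
)
```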