From c249bd8ee592311afd596bcb0c91bf2fa4b1cdb4 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Sat, 18 Jan 2025 02:04:08 +0000 Subject: [PATCH 1/5] Refactored modules/tokenizers to be a subdir of modules/transforms --- recipes/eleuther_eval.py | 2 +- tests/test_utils.py | 2 +- .../modules/tokenizers/test_sentencepiece.py | 2 +- tests/torchtune/modules/tokenizers/test_tiktoken.py | 2 +- tests/torchtune/modules/tokenizers/test_utils.py | 2 +- torchtune/data/_messages.py | 7 ++++--- torchtune/datasets/_alpaca.py | 2 +- torchtune/datasets/_chat.py | 2 +- torchtune/datasets/_cnn_dailymail.py | 2 +- torchtune/datasets/_grammar.py | 2 +- torchtune/datasets/_hh_rlhf_helpful.py | 2 +- torchtune/datasets/_instruct.py | 2 +- torchtune/datasets/_preference.py | 6 +++--- torchtune/datasets/_samsum.py | 2 +- torchtune/datasets/_sft.py | 12 +++++++----- torchtune/datasets/_slimorca.py | 2 +- torchtune/datasets/_stack_exchange_paired.py | 2 +- torchtune/datasets/_text_completion.py | 2 +- torchtune/datasets/_wikitext.py | 2 +- torchtune/models/clip/_tokenizer.py | 2 +- torchtune/models/gemma/_tokenizer.py | 4 ++-- torchtune/models/llama2/_tokenizer.py | 4 ++-- torchtune/models/llama3/_model_builders.py | 2 +- torchtune/models/llama3/_tokenizer.py | 5 ++++- torchtune/models/llama3_2_vision/_model_builders.py | 1 + torchtune/models/llama3_2_vision/_transform.py | 2 +- torchtune/models/mistral/_tokenizer.py | 4 ++-- torchtune/models/phi3/_model_builders.py | 2 +- torchtune/models/phi3/_tokenizer.py | 5 ++++- torchtune/models/qwen2/_model_builders.py | 2 +- torchtune/models/qwen2/_tokenizer.py | 2 +- torchtune/models/qwen2_5/_model_builders.py | 2 +- .../modules/{ => transforms}/tokenizers/__init__.py | 0 .../{ => transforms}/tokenizers/_sentencepiece.py | 3 +-- .../modules/{ => transforms}/tokenizers/_tiktoken.py | 2 +- .../modules/{ => transforms}/tokenizers/_utils.py | 4 ++-- 36 files changed, 56 insertions(+), 47 deletions(-) rename torchtune/modules/{ => transforms}/tokenizers/__init__.py (100%) rename torchtune/modules/{ => transforms}/tokenizers/_sentencepiece.py (98%) rename torchtune/modules/{ => transforms}/tokenizers/_tiktoken.py (98%) rename torchtune/modules/{ => transforms}/tokenizers/_utils.py (97%) diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py index 97744fb9b8..fd1ac8f6e1 100644 --- a/recipes/eleuther_eval.py +++ b/recipes/eleuther_eval.py @@ -31,8 +31,8 @@ from torchtune.modules import TransformerDecoder from torchtune.modules.common_utils import local_kv_cache from torchtune.modules.model_fusion import DeepFusionModel -from torchtune.modules.tokenizers import ModelTokenizer from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ModelTokenizer from torchtune.recipe_interfaces import EvalRecipeInterface from torchtune.training import FullModelTorchTuneCheckpointer diff --git a/tests/test_utils.py b/tests/test_utils.py index 6497539869..ca28029710 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -20,8 +20,8 @@ import torch from torch import nn from torchtune.data import Message, PromptTemplate, truncate -from torchtune.modules.tokenizers import ModelTokenizer from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ModelTokenizer skip_if_cuda_not_available = unittest.skipIf( not torch.cuda.is_available(), "CUDA is not available" diff --git a/tests/torchtune/modules/tokenizers/test_sentencepiece.py b/tests/torchtune/modules/tokenizers/test_sentencepiece.py index 
d11c1b9c52..217f0bf2d8 100644 --- a/tests/torchtune/modules/tokenizers/test_sentencepiece.py +++ b/tests/torchtune/modules/tokenizers/test_sentencepiece.py @@ -7,7 +7,7 @@ import pytest from tests.common import ASSETS -from torchtune.modules.tokenizers import SentencePieceBaseTokenizer +from torchtune.modules.transforms.tokenizers import SentencePieceBaseTokenizer class TestSentencePieceBaseTokenizer: diff --git a/tests/torchtune/modules/tokenizers/test_tiktoken.py b/tests/torchtune/modules/tokenizers/test_tiktoken.py index e7e69f62d3..5d3608d4bd 100644 --- a/tests/torchtune/modules/tokenizers/test_tiktoken.py +++ b/tests/torchtune/modules/tokenizers/test_tiktoken.py @@ -8,7 +8,7 @@ from tests.common import ASSETS from torchtune.models.llama3._tokenizer import CL100K_PATTERN -from torchtune.modules.tokenizers import TikTokenBaseTokenizer +from torchtune.modules.transforms.tokenizers import TikTokenBaseTokenizer class TestTikTokenBaseTokenizer: diff --git a/tests/torchtune/modules/tokenizers/test_utils.py b/tests/torchtune/modules/tokenizers/test_utils.py index 2c49d82a5a..e3a11e6f36 100644 --- a/tests/torchtune/modules/tokenizers/test_utils.py +++ b/tests/torchtune/modules/tokenizers/test_utils.py @@ -9,7 +9,7 @@ from tests.test_utils import DummyTokenizer from torchtune.data import Message -from torchtune.modules.tokenizers import tokenize_messages_no_special_tokens +from torchtune.modules.transforms.tokenizers import tokenize_messages_no_special_tokens class TestTokenizerUtils: diff --git a/torchtune/data/_messages.py b/torchtune/data/_messages.py index a4e00834c2..170970e5c5 100644 --- a/torchtune/data/_messages.py +++ b/torchtune/data/_messages.py @@ -22,9 +22,10 @@ class Message: """ This class represents individual messages in a fine-tuning dataset. It supports - text-only content, text with interleaved images, and tool calls. The :class:`~torchtune.modules.tokenizers.ModelTokenizer` - will tokenize the content of the message using ``tokenize_messages`` and attach - the appropriate special tokens based on the flags set in this class. + text-only content, text with interleaved images, and tool calls. The + :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` will tokenize + the content of the message using ``tokenize_messages`` and attach the appropriate + special tokens based on the flags set in this class. Args: role (Role): role of the message writer. 
Can be "system" for system prompts, diff --git a/torchtune/datasets/_alpaca.py b/torchtune/datasets/_alpaca.py index a881c149b0..c7795c8f28 100644 --- a/torchtune/datasets/_alpaca.py +++ b/torchtune/datasets/_alpaca.py @@ -12,7 +12,7 @@ from torchtune.datasets._packed import PackedDataset from torchtune.datasets._sft import SFTDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def alpaca_dataset( diff --git a/torchtune/datasets/_chat.py b/torchtune/datasets/_chat.py index f126fb3979..1e3962e14b 100644 --- a/torchtune/datasets/_chat.py +++ b/torchtune/datasets/_chat.py @@ -9,7 +9,7 @@ from torchtune.data._messages import OpenAIToMessages, ShareGPTToMessages from torchtune.datasets._packed import PackedDataset from torchtune.datasets._sft import SFTDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def chat_dataset( diff --git a/torchtune/datasets/_cnn_dailymail.py b/torchtune/datasets/_cnn_dailymail.py index d3c3af1f93..3995d46b22 100644 --- a/torchtune/datasets/_cnn_dailymail.py +++ b/torchtune/datasets/_cnn_dailymail.py @@ -8,7 +8,7 @@ from torchtune.datasets._text_completion import TextCompletionDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def cnn_dailymail_articles_dataset( diff --git a/torchtune/datasets/_grammar.py b/torchtune/datasets/_grammar.py index 9e9d700ea6..02970cedef 100644 --- a/torchtune/datasets/_grammar.py +++ b/torchtune/datasets/_grammar.py @@ -10,7 +10,7 @@ from torchtune.data import InputOutputToMessages from torchtune.datasets._packed import PackedDataset from torchtune.datasets._sft import SFTDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def grammar_dataset( diff --git a/torchtune/datasets/_hh_rlhf_helpful.py b/torchtune/datasets/_hh_rlhf_helpful.py index e466a8a4fd..8eea7e1a46 100644 --- a/torchtune/datasets/_hh_rlhf_helpful.py +++ b/torchtune/datasets/_hh_rlhf_helpful.py @@ -8,7 +8,7 @@ from torchtune.data import ChosenRejectedToMessages from torchtune.datasets._preference import PreferenceDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def hh_rlhf_helpful_dataset( diff --git a/torchtune/datasets/_instruct.py b/torchtune/datasets/_instruct.py index 0dfa46146d..20168aac1d 100644 --- a/torchtune/datasets/_instruct.py +++ b/torchtune/datasets/_instruct.py @@ -9,7 +9,7 @@ from torchtune.data import InputOutputToMessages from torchtune.datasets._packed import PackedDataset from torchtune.datasets._sft import SFTDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def instruct_dataset( diff --git a/torchtune/datasets/_preference.py b/torchtune/datasets/_preference.py index dea4eec852..c9615fe93c 100644 --- a/torchtune/datasets/_preference.py +++ b/torchtune/datasets/_preference.py @@ -11,10 +11,10 @@ from torch.utils.data import Dataset from torchtune.data import ChosenRejectedToMessages, CROSS_ENTROPY_IGNORE_IDX - -from torchtune.modules.tokenizers import ModelTokenizer from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ModelTokenizer + class PreferenceDataset(Dataset): """ @@ -84,7 +84,7 @@ class requires the dataset to 
have "chosen" and "rejected" model responses. Thes of messages are stored in the ``"chosen"`` and ``"rejected"`` keys. tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method. Since PreferenceDataset only supports text data, it requires a - :class:`~torchtune.modules.tokenizers.ModelTokenizer` instead of the ``model_transform`` in + :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` instead of the ``model_transform`` in :class:`~torchtune.datasets.SFTDataset`. filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See the Hugging Face `docs `_ for more diff --git a/torchtune/datasets/_samsum.py b/torchtune/datasets/_samsum.py index 905911d736..bd7f7dd8eb 100644 --- a/torchtune/datasets/_samsum.py +++ b/torchtune/datasets/_samsum.py @@ -10,7 +10,7 @@ from torchtune.data import InputOutputToMessages from torchtune.datasets._packed import PackedDataset from torchtune.datasets._sft import SFTDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def samsum_dataset( diff --git a/torchtune/datasets/_sft.py b/torchtune/datasets/_sft.py index 9ee11244b6..0d1461dd0d 100644 --- a/torchtune/datasets/_sft.py +++ b/torchtune/datasets/_sft.py @@ -69,11 +69,13 @@ class SFTDataset(Dataset): multimodal datasets requires processing the images in a way specific to the vision encoder being used by the model and is agnostic to the specific dataset. - Tokenization is handled by the ``model_transform``. All :class:`~torchtune.modules.tokenizers.ModelTokenizer` - can be treated as a ``model_transform`` since it uses the model-specific tokenizer to - transform the list of messages outputted from the ``message_transform`` into tokens - used by the model for training. Text-only datasets will simply pass the :class:`~torchtune.modules.tokenizers.ModelTokenizer` - into ``model_transform``. Tokenizers handle prompt templating, if configured. + Tokenization is handled by the ``model_transform``. All + :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` can be treated as + a ``model_transform`` since it uses the model-specific tokenizer to transform the + list of messages outputted from the ``message_transform`` into tokens used by the + model for training. Text-only datasets will simply pass the + :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` into ``model_transform``. + Tokenizers handle prompt templating, if configured. Args: source (str): path to dataset repository on Hugging Face. 
For local datasets, diff --git a/torchtune/datasets/_slimorca.py b/torchtune/datasets/_slimorca.py index 126b6b92e4..2701b2d717 100644 --- a/torchtune/datasets/_slimorca.py +++ b/torchtune/datasets/_slimorca.py @@ -10,7 +10,7 @@ from torchtune.datasets._packed import PackedDataset from torchtune.datasets._sft import SFTDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def slimorca_dataset( diff --git a/torchtune/datasets/_stack_exchange_paired.py b/torchtune/datasets/_stack_exchange_paired.py index 09eda929fe..a111d415d2 100644 --- a/torchtune/datasets/_stack_exchange_paired.py +++ b/torchtune/datasets/_stack_exchange_paired.py @@ -8,8 +8,8 @@ from torchtune.data import Message from torchtune.datasets._preference import PreferenceDataset -from torchtune.modules.tokenizers import ModelTokenizer from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ModelTokenizer class StackExchangePairedToMessages(Transform): diff --git a/torchtune/datasets/_text_completion.py b/torchtune/datasets/_text_completion.py index 5b5cc94299..342c6aa816 100644 --- a/torchtune/datasets/_text_completion.py +++ b/torchtune/datasets/_text_completion.py @@ -10,7 +10,7 @@ from torch.utils.data import Dataset from torchtune.data._utils import truncate from torchtune.datasets._packed import PackedDataset -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer class TextCompletionDataset(Dataset): diff --git a/torchtune/datasets/_wikitext.py b/torchtune/datasets/_wikitext.py index 01111a25c6..4f9ada6741 100644 --- a/torchtune/datasets/_wikitext.py +++ b/torchtune/datasets/_wikitext.py @@ -13,7 +13,7 @@ TextCompletionDataset, ) -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer def wikitext_dataset( diff --git a/torchtune/models/clip/_tokenizer.py b/torchtune/models/clip/_tokenizer.py index 69fed32c72..cdab2c9c05 100644 --- a/torchtune/models/clip/_tokenizer.py +++ b/torchtune/models/clip/_tokenizer.py @@ -7,7 +7,7 @@ import regex as re -from torchtune.modules.tokenizers._utils import BaseTokenizer +from torchtune.modules.transforms.tokenizers._utils import BaseTokenizer WORD_BOUNDARY = "" diff --git a/torchtune/models/gemma/_tokenizer.py b/torchtune/models/gemma/_tokenizer.py index e5eb89e230..dc5d2eadf8 100644 --- a/torchtune/models/gemma/_tokenizer.py +++ b/torchtune/models/gemma/_tokenizer.py @@ -7,12 +7,12 @@ from typing import Any, List, Mapping, Optional, Tuple from torchtune.data import Message, PromptTemplate -from torchtune.modules.tokenizers import ( +from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ( ModelTokenizer, SentencePieceBaseTokenizer, tokenize_messages_no_special_tokens, ) -from torchtune.modules.transforms import Transform WHITESPACE_CHARS = [" ", "\n", "\t", "\r", "\v"] diff --git a/torchtune/models/llama2/_tokenizer.py b/torchtune/models/llama2/_tokenizer.py index 078494c531..4e2ab6a40c 100644 --- a/torchtune/models/llama2/_tokenizer.py +++ b/torchtune/models/llama2/_tokenizer.py @@ -8,12 +8,12 @@ from torchtune.data import Message, PromptTemplate from torchtune.models.llama2._prompt_template import Llama2ChatTemplate -from torchtune.modules.tokenizers import ( +from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ( 
ModelTokenizer, SentencePieceBaseTokenizer, tokenize_messages_no_special_tokens, ) -from torchtune.modules.transforms import Transform WHITESPACE_CHARS = [" ", "\n", "\t", "\r", "\v"] diff --git a/torchtune/models/llama3/_model_builders.py b/torchtune/models/llama3/_model_builders.py index 0ddca90189..6c13e37cff 100644 --- a/torchtune/models/llama3/_model_builders.py +++ b/torchtune/models/llama3/_model_builders.py @@ -13,7 +13,7 @@ from torchtune.modules import TransformerDecoder from torchtune.modules.peft import LORA_ATTN_MODULES -from torchtune.modules.tokenizers import parse_hf_tokenizer_json +from torchtune.modules.transforms.tokenizers import parse_hf_tokenizer_json """ diff --git a/torchtune/models/llama3/_tokenizer.py b/torchtune/models/llama3/_tokenizer.py index 50ea0a7581..012aa9f584 100644 --- a/torchtune/models/llama3/_tokenizer.py +++ b/torchtune/models/llama3/_tokenizer.py @@ -8,8 +8,11 @@ from typing import Any, Dict, List, Mapping, Optional, Tuple from torchtune.data import Message, PromptTemplate, truncate -from torchtune.modules.tokenizers import ModelTokenizer, TikTokenBaseTokenizer from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ( + ModelTokenizer, + TikTokenBaseTokenizer, +) CL100K_PATTERN = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" # noqa diff --git a/torchtune/models/llama3_2_vision/_model_builders.py b/torchtune/models/llama3_2_vision/_model_builders.py index 4f035f92c5..beb4c12846 100644 --- a/torchtune/models/llama3_2_vision/_model_builders.py +++ b/torchtune/models/llama3_2_vision/_model_builders.py @@ -20,6 +20,7 @@ from torchtune.models.llama3_2_vision._transform import Llama3VisionTransform from torchtune.modules.model_fusion import DeepFusionModel from torchtune.modules.peft import LORA_ATTN_MODULES +from torchtune.modules.transforms.tokenizers import parse_hf_tokenizer_json def llama3_2_vision_transform( diff --git a/torchtune/models/llama3_2_vision/_transform.py b/torchtune/models/llama3_2_vision/_transform.py index eaf627d027..534ed4ab1c 100644 --- a/torchtune/models/llama3_2_vision/_transform.py +++ b/torchtune/models/llama3_2_vision/_transform.py @@ -10,8 +10,8 @@ from torchtune.models.clip import CLIPImageTransform from torchtune.models.llama3 import llama3_tokenizer -from torchtune.modules.tokenizers import ModelTokenizer from torchtune.modules.transforms import Transform, VisionCrossAttentionMask +from torchtune.modules.transforms.tokenizers import ModelTokenizer class Llama3VisionTransform(ModelTokenizer, Transform): diff --git a/torchtune/models/mistral/_tokenizer.py b/torchtune/models/mistral/_tokenizer.py index c3bbc8a4a7..49617220c3 100644 --- a/torchtune/models/mistral/_tokenizer.py +++ b/torchtune/models/mistral/_tokenizer.py @@ -8,12 +8,12 @@ from torchtune.data import Message, PromptTemplate from torchtune.models.mistral._prompt_template import MistralChatTemplate -from torchtune.modules.tokenizers import ( +from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ( ModelTokenizer, SentencePieceBaseTokenizer, tokenize_messages_no_special_tokens, ) -from torchtune.modules.transforms import Transform WHITESPACE_CHARS = [" ", "\n", "\t", "\r", "\v"] diff --git a/torchtune/models/phi3/_model_builders.py b/torchtune/models/phi3/_model_builders.py index 91d42623d7..e1275df783 100644 --- a/torchtune/models/phi3/_model_builders.py +++ 
b/torchtune/models/phi3/_model_builders.py @@ -6,7 +6,7 @@ from torchtune.modules import TransformerDecoder from torchtune.modules.peft import LORA_ATTN_MODULES from functools import partial -from torchtune.modules.tokenizers import parse_hf_tokenizer_json +from torchtune.modules.transforms.tokenizers import parse_hf_tokenizer_json from torchtune.data._prompt_templates import _TemplateType from torchtune.data._prompt_templates import _get_prompt_template diff --git a/torchtune/models/phi3/_tokenizer.py b/torchtune/models/phi3/_tokenizer.py index 38707bf26e..44f66b5934 100644 --- a/torchtune/models/phi3/_tokenizer.py +++ b/torchtune/models/phi3/_tokenizer.py @@ -9,8 +9,11 @@ from torchtune.data._messages import Message from torchtune.data._prompt_templates import PromptTemplate from torchtune.data._utils import truncate -from torchtune.modules.tokenizers import ModelTokenizer, SentencePieceBaseTokenizer from torchtune.modules.transforms import Transform +from torchtune.modules.transforms.tokenizers import ( + ModelTokenizer, + SentencePieceBaseTokenizer, +) PHI3_SPECIAL_TOKENS = { "<|endoftext|>": 32000, diff --git a/torchtune/models/qwen2/_model_builders.py b/torchtune/models/qwen2/_model_builders.py index 2a0ee06f83..f1ca5b8506 100644 --- a/torchtune/models/qwen2/_model_builders.py +++ b/torchtune/models/qwen2/_model_builders.py @@ -11,7 +11,7 @@ from torchtune.models.qwen2._tokenizer import QWEN2_SPECIAL_TOKENS, Qwen2Tokenizer from torchtune.modules import TransformerDecoder from torchtune.modules.peft import LORA_ATTN_MODULES -from torchtune.modules.tokenizers import parse_hf_tokenizer_json +from torchtune.modules.transforms.tokenizers import parse_hf_tokenizer_json """ Model builders build specific instantiations using component builders. For example diff --git a/torchtune/models/qwen2/_tokenizer.py b/torchtune/models/qwen2/_tokenizer.py index 0e4ee6bd35..dd6d038003 100644 --- a/torchtune/models/qwen2/_tokenizer.py +++ b/torchtune/models/qwen2/_tokenizer.py @@ -11,7 +11,7 @@ import regex as re from torchtune.data import ChatMLTemplate, Message, PromptTemplate, truncate -from torchtune.modules.tokenizers import ModelTokenizer +from torchtune.modules.transforms.tokenizers import ModelTokenizer PRETOKENIZE_REGEX = ( r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|" diff --git a/torchtune/models/qwen2_5/_model_builders.py b/torchtune/models/qwen2_5/_model_builders.py index 7d39802375..716ae48329 100644 --- a/torchtune/models/qwen2_5/_model_builders.py +++ b/torchtune/models/qwen2_5/_model_builders.py @@ -11,7 +11,7 @@ from torchtune.models.qwen2_5._tokenizer import QWEN2_5_SPECIAL_TOKENS, Qwen2_5Tokenizer from torchtune.modules import TransformerDecoder from torchtune.modules.peft import LORA_ATTN_MODULES -from torchtune.modules.tokenizers import parse_hf_tokenizer_json +from torchtune.modules.transforms.tokenizers import parse_hf_tokenizer_json """ Model builders build specific instantiations using component builders. 
For example
diff --git a/torchtune/modules/tokenizers/__init__.py b/torchtune/modules/transforms/tokenizers/__init__.py similarity index 100% rename from torchtune/modules/tokenizers/__init__.py rename to torchtune/modules/transforms/tokenizers/__init__.py
diff --git a/torchtune/modules/tokenizers/_sentencepiece.py b/torchtune/modules/transforms/tokenizers/_sentencepiece.py similarity index 98% rename from torchtune/modules/tokenizers/_sentencepiece.py rename to torchtune/modules/transforms/tokenizers/_sentencepiece.py index 0b22b63ee3..8d98617378 100644 --- a/torchtune/modules/tokenizers/_sentencepiece.py +++ b/torchtune/modules/transforms/tokenizers/_sentencepiece.py @@ -7,8 +7,7 @@ from typing import List, Optional from sentencepiece import SentencePieceProcessor - -from torchtune.modules.tokenizers._utils import BaseTokenizer +from torchtune.modules.transforms.tokenizers._utils import BaseTokenizer WHITESPACE_CHARS = [" ", "\n", "\t", "\r", "\v"]
diff --git a/torchtune/modules/tokenizers/_tiktoken.py b/torchtune/modules/transforms/tokenizers/_tiktoken.py similarity index 98% rename from torchtune/modules/tokenizers/_tiktoken.py rename to torchtune/modules/transforms/tokenizers/_tiktoken.py index 077b22b0cd..64733b4634 100644 --- a/torchtune/modules/tokenizers/_tiktoken.py +++ b/torchtune/modules/transforms/tokenizers/_tiktoken.py @@ -8,7 +8,7 @@ from tiktoken import Encoding from tiktoken.load import load_tiktoken_bpe -from torchtune.modules.tokenizers._utils import BaseTokenizer +from torchtune.modules.transforms.tokenizers._utils import BaseTokenizer # Constants controlling encode logic MAX_ENCODE_CHARS = 400_000
diff --git a/torchtune/modules/tokenizers/_utils.py b/torchtune/modules/transforms/tokenizers/_utils.py similarity index 97% rename from torchtune/modules/tokenizers/_utils.py rename to torchtune/modules/transforms/tokenizers/_utils.py index b580eda1c0..ff374738c7 100644 --- a/torchtune/modules/tokenizers/_utils.py +++ b/torchtune/modules/transforms/tokenizers/_utils.py @@ -14,8 +14,8 @@ class BaseTokenizer(Protocol): """ Abstract token encoding model that implements ``encode`` and ``decode`` methods. - See :class:`~torchtune.modules.tokenizers.SentencePieceBaseTokenizer` and - :class:`~torchtune.modules.tokenizers.TikTokenBaseTokenizer` for example implementations of this protocol. + See :class:`~torchtune.modules.transforms.tokenizers.SentencePieceBaseTokenizer` and + :class:`~torchtune.modules.transforms.tokenizers.TikTokenBaseTokenizer` for example implementations of this protocol. """ def encode(self, text: str, **kwargs: Dict[str, Any]) -> List[int]:
From fc2c9e0ae08a44c4044880e218bbbf9b6db710dd Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Sun, 5 Jan 2025 11:15:20 -0800 Subject: [PATCH 2/5] Update documentation to reflect tokenizers refactor under transforms module --- docs/source/api_ref_modules.rst | 12 ++++++------ docs/source/basics/tokenizers.rst | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/docs/source/api_ref_modules.rst b/docs/source/api_ref_modules.rst index 5eb8fff358..979e57347f 100644 --- a/docs/source/api_ref_modules.rst +++ b/docs/source/api_ref_modules.rst @@ -48,10 +48,10 @@ model specific tokenizers.
:toctree: generated/ :nosignatures: - tokenizers.SentencePieceBaseTokenizer - tokenizers.TikTokenBaseTokenizer - tokenizers.ModelTokenizer - tokenizers.BaseTokenizer + transforms.tokenizers.SentencePieceBaseTokenizer + transforms.tokenizers.TikTokenBaseTokenizer + transforms.tokenizers.ModelTokenizer + transforms.tokenizers.BaseTokenizer Tokenizer Utilities ------------------- @@ -61,8 +61,8 @@ These are helper methods that can be used by any tokenizer. :toctree: generated/ :nosignatures: - tokenizers.tokenize_messages_no_special_tokens - tokenizers.parse_hf_tokenizer_json + transforms.tokenizers.tokenize_messages_no_special_tokens + transforms.tokenizers.parse_hf_tokenizer_json PEFT Components diff --git a/docs/source/basics/tokenizers.rst b/docs/source/basics/tokenizers.rst index d637961c54..47be88fe0c 100644 --- a/docs/source/basics/tokenizers.rst +++ b/docs/source/basics/tokenizers.rst @@ -168,7 +168,7 @@ For example, here we change the ``"<|begin_of_text|>"`` and ``"<|end_of_text|>"` Base tokenizers --------------- -:class:`~torchtune.modules.tokenizers.BaseTokenizer` are the underlying byte-pair encoding modules that perform the actual raw string to token ID conversion and back. +:class:`~torchtune.modules.transforms.tokenizers.BaseTokenizer` are the underlying byte-pair encoding modules that perform the actual raw string to token ID conversion and back. In torchtune, they are required to implement ``encode`` and ``decode`` methods, which are called by the :ref:`model_tokenizers` to convert between raw text and token IDs. @@ -202,13 +202,13 @@ between raw text and token IDs. """ pass -If you load any :ref:`model_tokenizers`, you can see that it calls its underlying :class:`~torchtune.modules.tokenizers.BaseTokenizer` +If you load any :ref:`model_tokenizers`, you can see that it calls its underlying :class:`~torchtune.modules.transforms.tokenizers.BaseTokenizer` to do the actual encoding and decoding. .. code-block:: python from torchtune.models.mistral import mistral_tokenizer - from torchtune.modules.tokenizers import SentencePieceBaseTokenizer + from torchtune.modules.transforms.tokenizers import SentencePieceBaseTokenizer m_tokenizer = mistral_tokenizer("/tmp/Mistral-7B-v0.1/tokenizer.model") # Mistral uses SentencePiece for its underlying BPE @@ -227,7 +227,7 @@ to do the actual encoding and decoding. Model tokenizers ---------------- -:class:`~torchtune.modules.tokenizers.ModelTokenizer` are specific to a particular model. They are required to implement the ``tokenize_messages`` method, +:class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` are specific to a particular model. They are required to implement the ``tokenize_messages`` method, which converts a list of Messages into a list of token IDs. .. code-block:: python @@ -259,7 +259,7 @@ is because they add all the necessary special tokens or prompt templates require .. 
code-block:: python from torchtune.models.mistral import mistral_tokenizer - from torchtune.modules.tokenizers import SentencePieceBaseTokenizer + from torchtune.modules.transforms.tokenizers import SentencePieceBaseTokenizer from torchtune.data import Message m_tokenizer = mistral_tokenizer("/tmp/Mistral-7B-v0.1/tokenizer.model")
From e4518340a2b0d3522e221a49f2504b72f5cbe703 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Sat, 18 Jan 2025 03:20:31 +0000 Subject: [PATCH 3/5] Update tests to accommodate `tokenizers` move --- .../modules/{ => transforms}/tokenizers/test_sentencepiece.py | 0 .../modules/{ => transforms}/tokenizers/test_tiktoken.py | 0 .../modules/{ => transforms}/tokenizers/test_utils.py | 0 torchtune/models/t5/_tokenizer.py | 4 +++- 4 files changed, 3 insertions(+), 1 deletion(-) rename tests/torchtune/modules/{ => transforms}/tokenizers/test_sentencepiece.py (100%) rename tests/torchtune/modules/{ => transforms}/tokenizers/test_tiktoken.py (100%) rename tests/torchtune/modules/{ => transforms}/tokenizers/test_utils.py (100%)
diff --git a/tests/torchtune/modules/tokenizers/test_sentencepiece.py b/tests/torchtune/modules/transforms/tokenizers/test_sentencepiece.py similarity index 100% rename from tests/torchtune/modules/tokenizers/test_sentencepiece.py rename to tests/torchtune/modules/transforms/tokenizers/test_sentencepiece.py
diff --git a/tests/torchtune/modules/tokenizers/test_tiktoken.py b/tests/torchtune/modules/transforms/tokenizers/test_tiktoken.py similarity index 100% rename from tests/torchtune/modules/tokenizers/test_tiktoken.py rename to tests/torchtune/modules/transforms/tokenizers/test_tiktoken.py
diff --git a/tests/torchtune/modules/tokenizers/test_utils.py b/tests/torchtune/modules/transforms/tokenizers/test_utils.py similarity index 100% rename from tests/torchtune/modules/tokenizers/test_utils.py rename to tests/torchtune/modules/transforms/tokenizers/test_utils.py
diff --git a/torchtune/models/t5/_tokenizer.py b/torchtune/models/t5/_tokenizer.py index f89dff00f6..e4fa9c539e 100644 --- a/torchtune/models/t5/_tokenizer.py +++ b/torchtune/models/t5/_tokenizer.py @@ -5,7 +5,9 @@ # LICENSE file in the root directory of this source tree. from typing import Any, Dict, List -from torchtune.modules.tokenizers._sentencepiece import SentencePieceBaseTokenizer +from torchtune.modules.transforms.tokenizers._sentencepiece import ( + SentencePieceBaseTokenizer, +) class T5Tokenizer(SentencePieceBaseTokenizer):
From 88c65e63ef2995b9b62849fbb394f03f04388ba7 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Tue, 21 Jan 2025 20:34:09 +0000 Subject: [PATCH 4/5] Add backward-compatibility shim for old `torchtune.modules.tokenizers` import path --- torchtune/modules/tokenizers/__init__.py | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 torchtune/modules/tokenizers/__init__.py
diff --git a/torchtune/modules/tokenizers/__init__.py b/torchtune/modules/tokenizers/__init__.py new file mode 100644 index 0000000000..59fe15bfa7 --- /dev/null +++ b/torchtune/modules/tokenizers/__init__.py @@ -0,0 +1,39 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# NOTE: This file is maintained for backward compatibility purposes. +# The imports below point to the new location in `torchtune.modules.transforms.tokenizers`. +# The old import path will be removed in v0.7.
Please update your code to use the new path +# (torchtune.modules.transforms.tokenizers) to avoid breaking changes in future releases. + + +import warnings + +from torchtune.modules.transforms.tokenizers import ( + BaseTokenizer, + ModelTokenizer, + parse_hf_tokenizer_json, + SentencePieceBaseTokenizer, + TikTokenBaseTokenizer, + tokenize_messages_no_special_tokens, +) + +warnings.warn( + "The import path 'torchtune.modules.tokenizers' is deprecated and will be removed in v0.7. " + "Please update your imports to 'torchtune.modules.transforms.tokenizers'.", + DeprecationWarning, + stacklevel=2, +) + + +__all__ = [ + "SentencePieceBaseTokenizer", + "TikTokenBaseTokenizer", + "ModelTokenizer", + "BaseTokenizer", + "tokenize_messages_no_special_tokens", + "parse_hf_tokenizer_json", +]
From aaf416fce3d00d3934cc6eb0c46db484677b1e35 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Sat, 25 Jan 2025 23:32:54 +0000 Subject: [PATCH 5/5] Remove `__all__` and unused import; update docs --- docs/source/basics/custom_components.rst | 2 +- docs/source/basics/model_transforms.rst | 2 +- torchtune/models/llama3_2_vision/_model_builders.py | 1 - torchtune/modules/tokenizers/__init__.py | 12 ++---------- 4 files changed, 4 insertions(+), 13 deletions(-)
diff --git a/docs/source/basics/custom_components.rst b/docs/source/basics/custom_components.rst index f252cb197e..0f742644dc 100644 --- a/docs/source/basics/custom_components.rst +++ b/docs/source/basics/custom_components.rst @@ -117,7 +117,7 @@ our models in torchtune - see :func:`~torchtune.models.llama3_2_vision.llama3_2_ # from torchtune.datasets import SFTDataset, PackedDataset from torchtune.data import InputOutputToMessages - from torchtune.modules.tokenizers import ModelTokenizer + from torchtune.modules.transforms.tokenizers import ModelTokenizer # Example builder function for a custom code instruct dataset not in torchtune, but using # different dataset building blocks from torchtune
diff --git a/docs/source/basics/model_transforms.rst b/docs/source/basics/model_transforms.rst index c10cb1abd8..71e7e08bd5 100644 --- a/docs/source/basics/model_transforms.rst +++ b/docs/source/basics/model_transforms.rst @@ -101,7 +101,7 @@ The following methods are required on the model transform: .. code-block:: python - from torchtune.modules.tokenizers import ModelTokenizer + from torchtune.modules.transforms.tokenizers import ModelTokenizer from torchtune.modules.transforms import Transform class MyMultimodalTransform(ModelTokenizer, Transform):
diff --git a/torchtune/models/llama3_2_vision/_model_builders.py b/torchtune/models/llama3_2_vision/_model_builders.py index beb4c12846..4f035f92c5 100644 --- a/torchtune/models/llama3_2_vision/_model_builders.py +++ b/torchtune/models/llama3_2_vision/_model_builders.py @@ -20,7 +20,6 @@ from torchtune.models.llama3_2_vision._transform import Llama3VisionTransform from torchtune.modules.model_fusion import DeepFusionModel from torchtune.modules.peft import LORA_ATTN_MODULES -from torchtune.modules.transforms.tokenizers import parse_hf_tokenizer_json def llama3_2_vision_transform(
diff --git a/torchtune/modules/tokenizers/__init__.py b/torchtune/modules/tokenizers/__init__.py index 59fe15bfa7..f10a9b3dd6 100644 --- a/torchtune/modules/tokenizers/__init__.py +++ b/torchtune/modules/tokenizers/__init__.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree.
+# flake8: noqa: F401 + # NOTE: This file is maintained for backward compatibility purposes. # The imports below point to the new location in `torchtune.modules.transforms.tokenizers`. # The import paths will be removed in v0.7. Please update your code to use the new path @@ -27,13 +29,3 @@ DeprecationWarning, stacklevel=2, ) - - -__all__ = [ - "SentencePieceBaseTokenizer", - "TikTokenBaseTokenizer", - "ModelTokenizer", - "BaseTokenizer", - "tokenize_messages_no_special_tokens", - "parse_hf_tokenizer_json", -]
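
To sanity-check the shim from PATCH 4/5 by hand, a minimal sketch is below. It assumes a fresh interpreter with this branch installed; ``ModelTokenizer`` is just one example of the re-exported symbols.

.. code-block:: python

    import warnings

    # Old path: still re-exported by the shim, but importing it
    # emits a DeprecationWarning at module import time.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        from torchtune.modules.tokenizers import ModelTokenizer  # noqa: F401

    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    # New canonical path: same object, no warning.
    from torchtune.modules.transforms.tokenizers import ModelTokenizer  # noqa: F811

Note that because Python caches modules, the warning fires only on the first import of ``torchtune.modules.tokenizers`` in a process, so this check must run in a fresh interpreter.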