diff --git a/.ci/scripts/check_gibberish b/.ci/scripts/check_gibberish index 5d9783b3b..912020a5a 100755 --- a/.ci/scripts/check_gibberish +++ b/.ci/scripts/check_gibberish @@ -24,6 +24,18 @@ else fi fi +####################################################################### +# +# check whether aspell spell check is available + +if command -v aspell &> /dev/null; then + echo "Checking $TMPFILE for gibberish" +else + echo "Aspell is not installed or not in PATH." + echo "Skipping gibberish check for $TMPFILE" + exit 0 +fi + ####################################################################### # # run spell check on the extracted sequence diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs index 521cfa811..71f074cef 100755 --- a/.ci/scripts/run-docs +++ b/.ci/scripts/run-docs @@ -1,144 +1,67 @@ -# /bin/bash -x +#!/bin/bash -x -if [ "X$1" == "X" ]; then +# Check if an argument was provided +if [ -z "$1" ]; then echo "Must specify document to run" exit 1 fi -if [ "$1" == "readme" ]; then - echo "::group::Create script to run README" - python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3.1:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-readme.sh - echo "::endgroup::" - - echo "::group::Run README" - echo "*******************************************" - cat ./run-readme.sh - echo "*******************************************" - bash -x ./run-readme.sh - echo "::endgroup::" - - exit 0 -fi - -if [ "$1" == "quantization" ]; then - echo "::group::Create script to run quantization" - python3 torchchat/utils/scripts/updown.py --create-sections --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-quantization.sh - echo "::endgroup::" - - echo "::group::Run quantization" - echo "*******************************************" - cat ./run-quantization.sh - echo "*******************************************" - bash -x ./run-quantization.sh - echo "::endgroup::" - - exit 0 -fi - -if [ "$1" == "gguf" ]; then - echo "::group::Create script to run gguf" - python3 torchchat/utils/scripts/updown.py --file docs/GGUF.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-gguf.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-gguf.sh - echo "::endgroup::" - - echo "::group::Run gguf" - echo "*******************************************" - cat ./run-gguf.sh - echo "*******************************************" - bash -x ./run-gguf.sh - echo "::endgroup::" -fi - - -if [ "$1" == "advanced" ]; then - echo "::group::Create script to run advanced" - python3 torchchat/utils/scripts/updown.py --file docs/ADVANCED-USERS.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-advanced.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-advanced.sh - echo "::endgroup::" - - echo "::group::Run advanced" - echo "*******************************************" - cat ./run-advanced.sh - echo "*******************************************" - bash -x ./run-advanced.sh - echo "::endgroup::" -fi - 
-if [ "$1" == "evaluation" ]; then - echo "::group::Create script to run evaluation" - python3 torchchat/utils/scripts/updown.py --file torchchat/utils/docs/evaluation.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-evaluation.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-evaluation.sh - echo "::endgroup::" - - echo "::group::Run evaluation" - echo "*******************************************" - cat ./run-evaluation.sh - echo "*******************************************" - bash -x ./run-evaluation.sh -fi - -if [ "$1" == "multimodal" ]; then - - # Expecting that this might fail this test as-is, because - # it's the first on-pr test depending on github secrets for access with HF token access - - echo "::group::Create script to run multimodal" - python3 torchchat/utils/scripts/updown.py --file docs/multimodal.md > ./run-multimodal.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-multimodal.sh - echo "::endgroup::" - - echo "::group::Run multimodal" - echo "*******************************************" - cat ./run-multimodal.sh - echo "*******************************************" - bash -x ./run-multimodal.sh - echo "::endgroup::" -fi - -if [ "$1" == "native" ]; then - - echo "::group::Create script to run native-execution" - python3 torchchat/utils/scripts/updown.py --file docs/native-execution.md > ./run-native.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-native.sh - echo "::endgroup::" - - echo "::group::Run native-execution" - echo "*******************************************" - cat ./run-native.sh - echo "*******************************************" - bash -x ./run-native.sh - echo "::endgroup::" -fi - -if [ "$1" == "distributed" ]; then - - echo "::group::Create script to run distributed" - python3 torchchat/utils/scripts/updown.py --file docs/distributed.md > ./run-distributed.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-distributed.sh - echo "::endgroup::" - - echo "::group::Run distributed" - echo "*******************************************" - cat ./run-distributed.sh - echo "*******************************************" - bash -x ./run-distributed.sh - echo "::endgroup::" -fi +# Pre-initialize variables +filepath="" +parameters="--replace 'llama3:stories15M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN" +script_name="./run-${1}.sh" # Dynamically initialize script name + +# Use a case statement to handle the $1 argument +case "$1" in + "readme") + filepath="README.md" + ;; + "quantization") + filepath="docs/quantization.md" + ;; + "gguf") + filepath="docs/GGUF.md" + ;; + "advanced") + filepath="docs/ADVANCED-USERS.md" + ;; + "evaluation") + filepath="torchchat/utils/docs/evaluation.md" + ;; + "multimodal") + filepath="docs/multimodal.md" + parameters="" # Clear parameters + ;; + "native") + filepath="docs/native-execution.md" + parameters="" # Clear parameters + ;; + "distributed") + filepath="docs/distributed.md" + parameters="--replace 'llama3.1:stories110M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN" # Use stories110M to avoid need for authentication + ;; + "local") + filepath="docs/local-model.md" + parameters="" # Clear parameters + ;; + + *) + echo "Unknown 
option: $1" + exit 1 + ;; +esac + +# Generate the script +echo "::group::Create script to run $1" +python3 torchchat/utils/scripts/updown.py --file "$filepath" $parameters > "$script_name" +# if something happened to updown processor, and it did not error out, fail with an exit 1 +echo "exit 1" >> "$script_name" +echo "::endgroup::" + +# Run the script +echo "::group::Run $1" +echo "*******************************************" +cat "$script_name" +echo "*******************************************" +bash -x "$script_name" +echo "::endgroup::" diff --git a/.github/workflows/more-tests.yml b/.github/workflows/more-tests.yml index c2502e7e4..0d0316069 100644 --- a/.github/workflows/more-tests.yml +++ b/.github/workflows/more-tests.yml @@ -40,9 +40,10 @@ jobs: echo "::endgroup::" echo "::group::Run inference" - export MODEL_PATH=checkpoints/stories15M/stories15M.pt + export MODEL_DIR=checkpoints/stories15M/ + export MODEL_PATH=${MODEL_DIR}/stories15M.pt export MODEL_NAME=stories15M - export MODEL_DIR=/tmp + for DTYPE in bfloat16 float16 float32; do ################################################################### @@ -145,3 +146,65 @@ jobs: echo "tests complete" echo "******************************************" echo "::endgroup::" + + test-sdpa-backends-export: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.4" + timeout: 60 + script: | + set -xeou pipefail + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Download checkpoints" + # Install requirements + ./install/install_requirements.sh cuda + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + echo "::endgroup::" + + echo "::group::Download checkpoints" + mkdir -p checkpoints/stories15M + pushd checkpoints/stories15M + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt + wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model + popd + echo "::endgroup::" + + echo "::group::Run inference" + export MODEL_DIR=checkpoints/stories15M/ + export MODEL_PATH=${MODEL_DIR}/stories15M.pt + export MODEL_NAME=stories15M + + ./torchchat/utils/scripts/build_native.sh aoti + + for DEVICE in cpu cuda; do + # depending on how the parameter passing works, may only be able to do bfloat16 for aoti_run, similar to runner-cuda-dtype.yml + # (although the runner environment should not have an opinion what we us in the artifact, and we might suitably abstract that) + for DTYPE in bfloat16 float16 float32; do + for SDPA in 'math' 'flash_attention' 'efficient_attention' 'cudnn_attention'; do + echo "***************************************************************" + echo "*** $DEVICE $DTYPE $SDPA" + ################################################################### + # Export DSO and run with Python + python torchchat.py export --output-dso dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE} + python torchchat.py generate --dso-path dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE} --temperature 0 --prompt "Once upon a time" + ################################################################### + # Export AOTI and run with aoti_run + python torchchat.py export --output-aoti /tmp/model.pt2 --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE} + 
./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "Once upon a time" + ################################################################### + done + done + done + + echo "tests complete" + echo "******************************************" + echo "::endgroup::" \ No newline at end of file diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml index 4d5cd7e14..db16bc80e 100644 --- a/.github/workflows/run-readme-pr-mps.yml +++ b/.github/workflows/run-readme-pr-mps.yml @@ -15,8 +15,8 @@ jobs: conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp conda activate test-readme-mps-macos set -x - # NS: Remove previous installation of torch first - # as this script does not isntall anything into conda env but rather as system dep + # NS: Remove previous installation of torch first + # as this script does not install anything into conda env but rather as system dep pip3 uninstall -y torch || true set -eou pipefail @@ -37,6 +37,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: runner: macos-m1-14 + timeout: 60 script: | set -x conda create -y -n test-quantization-mps-macos python=3.10.11 diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml index f32473435..37c27822b 100644 --- a/.github/workflows/run-readme-pr.yml +++ b/.github/workflows/run-readme-pr.yml @@ -306,3 +306,25 @@ jobs: echo "::endgroup::" TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native + + test-distributed-cuda: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.4" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + .ci/scripts/run-docs distributed + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" diff --git a/README.md b/README.md index 2448b0b72..51db1bfca 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,11 @@ torchchat is a small codebase showcasing the ability to run large language models (LLMs) seamlessly. With torchchat, you can run LLMs using Python, within your own (C/C++) application (desktop or server) and on iOS and Android. > [!IMPORTANT] -> Update September 25, 2024: torchchat has multimodal support for **Llama3.2 11B**!! +> Update +> +> **February 3, 2025**: torchchat has support for [**DeepSeek R1 Distill: 8B**]( https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)! +> +> **September 25, 2024**: torchchat has multimodal support for **Llama3.2 11B**! > > To try it out, finish the [Installation](#Installation) section below, then hop > over to our [multimodal guide](docs/multimodal.md) to learn more. @@ -75,6 +79,7 @@ aliases. 
| [ibm-granite/granite-3.0-8b-instruct](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) |✅| Alias to `granite3-8b`.| | [ibm-granite/granite-3.1-2b-instruct](https://huggingface.co/ibm-granite/granite-3.1-2b-instruct) |✅| Alias to `granite3.1-2b` and `granite3.1`.| | [ibm-granite/granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) |✅| Alias to `granite3.1-8b`.| +| [deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) |✅| Alias to `deepseek-r1:8b`.| ## Installation @@ -413,7 +418,7 @@ torchchat/utils/scripts/build_native.sh et Execute using the runner ```bash -cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time" +cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time" ``` diff --git a/docs/quantization.md b/docs/quantization.md index 704a7ed6a..56fd2182e 100644 --- a/docs/quantization.md +++ b/docs/quantization.md @@ -182,7 +182,7 @@ OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so If you built the AOTI runner with link_torchao_ops as discussed in the setup section, you can also use the C++ runner: ``` -OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time," +OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -i "Once upon a time," # -l 3 ``` #### ExecuTorch @@ -193,7 +193,7 @@ python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"e Note: only the ExecuTorch C++ runner in torchchat when built using the instructions in the setup can run the exported *.pte file. It will not work with the `python torchchat.py generate` command. 
``` -./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time," +./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l3 -i "Once upon a time," ``` ## Experimental TorchAO MPS lowbit kernels diff --git a/runner/run.cpp b/runner/run.cpp index e5c818cfa..d64c636bb 100644 --- a/runner/run.cpp +++ b/runner/run.cpp @@ -803,41 +803,53 @@ int main(int argc, char *argv[]) { } else { error_usage(); } - for (int i = 2; i < argc; i += 2) { + for (int i = 2; i < argc; i += 1) { // do some basic validation - if (i + 1 >= argc) { - error_usage(); - } // must have arg after flag + char *parm = argv[i + 1]; + // uniarg means the argument is attached directly to the option letter, in accordance with POSIX + int uniarg = strlen(argv[i]) > 2; + if (argv[i][0] != '-') { error_usage(); } // must start with dash - if (strlen(argv[i]) != 2) { + + if (strlen(argv[i]) < 2) { error_usage(); - } // must be -x (one dash, one letter) + } // must have at least dash '-' and option letter + + if (uniarg) { + parm = &argv[i][2]; + } else if (i + 1 >= argc) { + error_usage(); + } // must have an argument after the option unless the value is attached to the option letter + // read in the args if (argv[i][1] == 't') { - temperature = atof(argv[i + 1]); + temperature = atof(parm); } else if (argv[i][1] == 'p') { - topp = atof(argv[i + 1]); + topp = atof(parm); } else if (argv[i][1] == 's') { - rng_seed = atoi(argv[i + 1]); + rng_seed = atoi(parm); } else if (argv[i][1] == 'n') { - steps = atoi(argv[i + 1]); + steps = atoi(parm); } else if (argv[i][1] == 'v') { - vocab_size = atoi(argv[i + 1]); + vocab_size = atoi(parm); } else if (argv[i][1] == 'i') { - prompt = argv[i + 1]; + prompt = parm; } else if (argv[i][1] == 'z') { - tokenizer_path = argv[i + 1]; + tokenizer_path = parm; } else if (argv[i][1] == 'm') { - mode = argv[i + 1]; + mode = parm; } else if (argv[i][1] == 'y') { - system_prompt = argv[i + 1]; + system_prompt = parm; } else if (argv[i][1] == 'l') { - llama_ver = atoi(argv[i + 1]); + llama_ver = atoi(parm); } else { error_usage(); } + + // account for the separate parameter, if one was consumed + i += uniarg ? 0 : 1; } if (model_path == NULL) { diff --git a/tokenizer/hf_tokenizer.py b/tokenizer/hf_tokenizer.py index d10ecb076..b77ee43ea 100644 --- a/tokenizer/hf_tokenizer.py +++ b/tokenizer/hf_tokenizer.py @@ -46,8 +46,14 @@ def __init__(self, file_path: str): if tokenizer_config_path is not None: with open(tokenizer_config_path, "r") as handle: tok_config = json.load(handle) - bos_token = tok_config.get("bos_token") - eos_token = tok_config.get("eos_token") + + def _extract_token(identifier: str) -> Optional[str]: + entry: Optional[Union[str, dict]] = tok_config.get(identifier) + return entry.get("content") if isinstance(entry, dict) else entry + + bos_token = _extract_token("bos_token") + eos_token = _extract_token("eos_token") + if bos_token is not None: + self._bos_id = self._tokenizer.token_to_id(bos_token) if eos_token is not None: diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index a5b23dfe3..1e04800ab 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -56,6 +56,7 @@ class BuilderArgs: gguf_kwargs: Optional[Dict[str, Any]] = None dso_path: Optional[Union[Path, str]] = None aoti_package_path: Optional[Union[Path, str]] = None + snapshot_path: Optional[Union[Path, str]] = None pte_path: Optional[Union[Path, str]] = None device: Optional[str] = None precision: torch.dtype = torch.float32 @@ -87,6 +88,7 
@@ def __post_init__(self): or (self.dso_path and Path(self.dso_path).is_file()) or (self.aoti_package_path and Path(self.aoti_package_path).is_file()) or (self.pte_path and Path(self.pte_path).is_file()) + or (self.snapshot_path and Path(self.snapshot_path).is_file()) ): raise RuntimeError( "need to specify a valid checkpoint path, checkpoint dir, gguf path, DSO path, AOTI PACKAGE or PTE path" ) @@ -142,6 +144,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": dso_path = getattr(args, "dso_path", None) pte_path = getattr(args, "pte_path", None) aoti_package_path = getattr(args, "aoti_package_path", None) + snapshot_path = getattr(args, "snapshot_path", None) is_chat_model = False if args.is_chat_model: @@ -169,6 +172,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": output_pte_path = getattr(args, "output_pte_path", None) output_aoti_package_path = getattr(args, "output_aoti_package_path", None) output_dso_path = getattr(args, "output_dso_path", None) + output_snapshot_path = getattr(args, "output_snapshot_path", None) if output_pte_path and args.dtype.startswith("fast"): if args.dtype == "fast": # As per Kimish, float32 should be faster on ET XNNPACK @@ -206,6 +210,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": dso_path=dso_path, aoti_package_path=aoti_package_path, pte_path=pte_path, + snapshot_path=snapshot_path, device=args.device, precision=dtype, setup_caches=( @@ -631,6 +636,34 @@ def do_nothing(max_batch_size, max_seq_length): model = PTEModel(config, builder_args.pte_path) except Exception: raise RuntimeError(f"Failed to load ET compiled {builder_args.pte_path}") + elif builder_args.snapshot_path: + # Resolve ModelArgs so the loaded snapshot's config can be validated + # If a manual params_path is provided, use that + if builder_args.params_path: + config: ModelArgs = ModelArgs.from_params(builder_args.params_path) + else: + # TODO: Instead of loading the whole model, refactor to call a + # helper that generates just model.config + with measure_time("Time to load model: {time:.02f} seconds"): + model = _load_model(builder_args) + device_sync(device=builder_args.device) + config = model.config + model = None + try: + model = torch.load(builder_args.snapshot_path, weights_only=False) + except Exception: + raise RuntimeError(f"Failed to load torchchat snapshot {builder_args.snapshot_path}") + # _active_backend() does not allow DSO & AOTI to be true. + # Choose either. 
+ from torchchat.utils.build_utils import set_backend + set_backend(dso=True, pte=False, aoti_package=False) + if model.config != config: + raise RuntimeError("loaded model architecture mismatch") + ## + ## import all libraries with custom kernels and custom operators + ## that quantize may be pulling in + ## + elif builder_args.distributed: pp_degree = builder_args.pp tp_degree = builder_args.tp diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index 70f404635..1d531c709 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -207,6 +207,12 @@ def _add_export_output_path_args(parser) -> None: default=None, help="Output to the specified AOT Inductor .dso model file", ) + exclusive_parser.add_argument( + "--output-snapshot-path", + type=str, + default=None, + help="Output to the specified torchchat snapshot .tc model file", + ) exclusive_parser.add_argument( "--output-aoti-package-path", type=str, @@ -254,7 +260,13 @@ def _add_exported_input_path_args(parser) -> None: default=None, help="Use the specified ExecuTorch .pte model file", ) - + exclusive_parser.add_argument( + "--snapshot-path", + type=Path, + default=None, + help="Use the specified torchchat snapshot .tc model file", + ) + # Add CLI Args related to JIT downloading of model artifacts def _add_jit_downloading_args(parser) -> None: diff --git a/torchchat/export.py b/torchchat/export.py index 829bd47db..997639ffe 100644 --- a/torchchat/export.py +++ b/torchchat/export.py @@ -28,6 +28,31 @@ default_device = "cpu" +""" +Export Snapshot +""" + + +def export_snapshot( + model: nn.Module, + device: Optional[str] = None, + output_path: str = "model-snapshot.tc", +) -> str: + """ + Export the model as a snapshot. + + Args: + model: The model to be exported. + device: The device to run the model on. + output_path: The path to save the exported model. + Returns: + The path to the exported model. + """ + assert output_path.endswith(".tc"), "use .tc extension for snapshots" + torch.save(model, output_path) + return output_path + + """ Export for Server """ @@ -72,6 +97,7 @@ def export_for_server( "aot_inductor.package": package, "aot_inductor.metadata": metadata or {}, } + if not package: options = {"aot_inductor.output_path": output_path} @@ -373,6 +399,7 @@ def main(args): output_pte_path = args.output_pte_path output_dso_path = args.output_dso_path + output_snapshot_path = args.output_snapshot_path output_aoti_package_path = args.output_aoti_package_path if output_pte_path and builder_args.device != "cpu": @@ -380,7 +407,7 @@ def main(args): f"Warning! ExecuTorch export target is controlled by export recipe, not device setting. Ignoring device={builder_args.device} setting." ) builder_args.device = "cpu" - elif "mps" in builder_args.device: + elif (output_pte_path or output_dso_path or output_aoti_package_path) and "mps" in builder_args.device: print("Warning! Device MPS not supported for export. 
Exporting for device CPU.") builder_args.device = "cpu" @@ -417,6 +444,7 @@ def main(args): model_to_pte = model model_to_dso = model model_to_aoti_package = model + model_to_snapshot = model else: if output_pte_path: _set_gguf_kwargs(builder_args, is_et=True, context="export") @@ -436,6 +464,15 @@ def main(args): model_to_dso = model_to_aoti_package _unset_gguf_kwargs(builder_args) + if output_snapshot_path: + _set_gguf_kwargs(builder_args, is_et=False, context="export") + model_to_snapshot = _initialize_model( + builder_args, + quantize, + support_tensor_subclass=False, + ) + _unset_gguf_kwargs(builder_args) + with torch.no_grad(): if output_pte_path: output_pte_path = str(os.path.abspath(output_pte_path)) @@ -453,13 +490,14 @@ def main(args): print( "WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead." ) - export_for_server( - model_to_dso, - builder_args.device, - output_dso_path, - builder_args.dynamic_shapes, - package=False, - ) + with torch.nn.attention.sdpa_kernel([builder_args.attention_backend]): + export_for_server( + model_to_dso, + builder_args.device, + output_dso_path, + builder_args.dynamic_shapes, + package=False, + ) if output_aoti_package_path: output_aoti_package_path = str(os.path.abspath(output_aoti_package_path)) @@ -475,11 +513,21 @@ def main(args): print( "Exporting model using AOT Inductor to " f"{output_aoti_package_path}." ) - export_for_server( - model_to_aoti_package, + with torch.nn.attention.sdpa_kernel([builder_args.attention_backend]): + export_for_server( + model_to_aoti_package, + builder_args.device, + output_aoti_package_path, + builder_args.dynamic_shapes, + package=True, + metadata=metadata, + ) + + if output_snapshot_path: + output_snapshot_path = str(os.path.abspath(output_snapshot_path)) + print(f"Exporting model using Snapshot to {output_snapshot_path}") + export_snapshot( + model_to_snapshot, builder_args.device, - output_aoti_package_path, - builder_args.dynamic_shapes, - package=True, - metadata=metadata, + output_snapshot_path, ) diff --git a/torchchat/model_config/models.json b/torchchat/model_config/models.json index d2252e6dd..3c2161b9b 100644 --- a/torchchat/model_config/models.json +++ b/torchchat/model_config/models.json @@ -51,6 +51,12 @@ "distribution_path": "meta-llama/Meta-Llama-3.1-8B-Instruct", "transformer_params_key": "Meta-Llama-3.1-8B" }, + "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": { + "aliases": ["deepseek-r1:8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "tokenizer_file": "tokenizer.json" + }, "meta-llama/Meta-Llama-3.1-70B-Instruct": { "aliases": ["llama3.1-70b"], "distribution_channel": "HuggingFaceSnapshot", diff --git a/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json b/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json new file mode 100644 index 000000000..b9fa79cd2 --- /dev/null +++ b/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json @@ -0,0 +1 @@ +{"block_size": 131072, "dim": 4096, "ffn_dim_multiplier": 1.3, "multiple_of": 1024, "n_heads": 32, "n_local_heads": 8, "n_layers": 32, "rope_base": 500000.0, "vocab_size": 128256, "use_tiktoken": true, "use_hf_tokenizer": true, "norm_eps": 1e-05, "rope_scaling": {"factor": 8.0, "low_freq_factor": 1.0, "high_freq_factor": 4.0, "original_max_position_embeddings": 8192}} diff --git a/torchchat/quant_config/cuda-32.json b/torchchat/quant_config/cuda-32.json new file mode 100644 index 000000000..90c37250a 
--- /dev/null +++ b/torchchat/quant_config/cuda-32.json @@ -0,0 +1,5 @@ +{ + "executor": {"accelerator": "cuda"}, + "precision": {"dtype": "bf16"}, + "linear:int4": {"groupsize" : 32} +} diff --git a/torchchat/quant_config/mobile-32.json b/torchchat/quant_config/mobile-32.json new file mode 100644 index 000000000..3afaa7542 --- /dev/null +++ b/torchchat/quant_config/mobile-32.json @@ -0,0 +1,4 @@ +{ + "embedding": {"bitwidth": 4, "groupsize" : 32}, + "linear:a8w4dq": {"groupsize" : 32} +}
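For reference, a minimal sketch of how the consolidated `.ci/scripts/run-docs` dispatcher above is invoked: the document name is the single argument (matching the case labels), and the `TORCHCHAT_DEVICE=cpu` form mirrors the existing call in `run-readme-pr.yml`.

```bash
# Run the README walkthrough end to end (writes ./run-readme.sh, then executes it)
.ci/scripts/run-docs readme

# Run the native-execution doc on CPU, as run-readme-pr.yml does
TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native

# Run the distributed doc; this path rewrites llama3.1 to stories110M so no HF authentication is needed
.ci/scripts/run-docs distributed
```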
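A hedged sketch of the snapshot round-trip that the new `--output-snapshot-path` / `--snapshot-path` flags are meant to enable (flag names are taken from the `cli.py` hunk above; the model name, device, and prompt are illustrative assumptions, not taken from this diff):

```bash
# Export the eager model as a torchchat snapshot (.tc), which export_snapshot() writes with a plain torch.save
python3 torchchat.py export llama3.1 --device cuda --output-snapshot-path llama3_1.tc

# Reload the saved snapshot at generate time instead of rebuilding the model
python3 torchchat.py generate llama3.1 --snapshot-path llama3_1.tc --prompt "Once upon a time"
```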
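Assuming the new group-size-32 recipes are consumed the same way as the other files under `torchchat/quant_config/` (passed as a file path to the existing `--quantize` option), usage would look roughly like the following; the model name and output paths are placeholders:

```bash
# CUDA: bf16 precision with int4 weight-only linear quantization at groupsize 32
python3 torchchat.py export llama3.1 --quantize torchchat/quant_config/cuda-32.json --output-aoti-package-path llama3_1.pt2

# Mobile/ExecuTorch: 4-bit embeddings and a8w4dq linears at groupsize 32
python3 torchchat.py export llama3.1 --quantize torchchat/quant_config/mobile-32.json --output-pte-path llama3_1.pte
```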