diff --git a/.github/workflows/_example_tests_runner.yml b/.github/workflows/_example_tests_runner.yml index 5aa0614c71..992b4127db 100644 --- a/.github/workflows/_example_tests_runner.yml +++ b/.github/workflows/_example_tests_runner.yml @@ -47,10 +47,6 @@ jobs: echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV - name: Install dependencies run: | - # Install git-lfs for Daring-Anteater dataset - apt-get update && apt-get install -y git-lfs - git lfs install --system - # use `python -m pip` instead of `pip` to avoid conflicts with system pip for nemo containers python -m pip install ".${{ inputs.pip_install_extras }}" diff --git a/.github/workflows/delete_outdated_pr_branches.yml b/.github/workflows/delete_outdated_pr_branches.yml deleted file mode 100644 index 532b5c5b7d..0000000000 --- a/.github/workflows/delete_outdated_pr_branches.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: Delete Outdated PR Branches - -on: - schedule: - - cron: "0 9 * * 1" # Every Monday at 9:00 UTC - workflow_dispatch: # On-demand - -permissions: - contents: write - pull-requests: read - -jobs: - delete-outdated-pr-branches: - runs-on: ubuntu-latest - timeout-minutes: 15 - steps: - - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Delete branches for closed/merged PRs - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - REPO="${{ github.repository }}" - DELETED=0 - SKIPPED=0 - - # List all remote branches matching pull-request/ - git fetch --prune origin - for branch in $(git branch -r | grep -oP 'origin/pull-request/\K[0-9]+' | sort -un); do - FULL_BRANCH="pull-request/${branch}" - STATE=$(gh pr view "$branch" --repo "$REPO" --json state --jq '.state' 2>/dev/null || echo "") - - if [ "$STATE" = "CLOSED" ] || [ "$STATE" = "MERGED" ]; then - echo "Deleting branch '${FULL_BRANCH}' (PR #${branch} is ${STATE})" - git push origin --delete "$FULL_BRANCH" && DELETED=$((DELETED + 1)) || true - elif [ "$STATE" = "OPEN" ]; then - echo "Skipping branch 
'${FULL_BRANCH}' (PR #${branch} is still OPEN)" - SKIPPED=$((SKIPPED + 1)) - else - echo "Skipping branch '${FULL_BRANCH}' (could not determine PR #${branch} state)" - SKIPPED=$((SKIPPED + 1)) - fi - done - - echo "" - echo "Done. Deleted: ${DELETED}, Skipped: ${SKIPPED}" diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml index f3f3908043..f8ef06c7db 100644 --- a/.github/workflows/example_tests.yml +++ b/.github/workflows/example_tests.yml @@ -70,9 +70,9 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3" + docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3" example: ${{ matrix.example }} - timeout_minutes: 30 + timeout_minutes: 45 pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-h100-latest-1 @@ -82,9 +82,9 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3" + docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3" example: ${{ matrix.example }} - timeout_minutes: 30 + timeout_minutes: 45 pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-rtxpro6000-latest-2 @@ -99,7 +99,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5" + docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-rtxpro6000-latest-1 @@ -113,7 +113,7 @@ jobs: uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: - docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5" + docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10" example: ${{ matrix.example }} pip_install_extras: "[hf,dev-test]" runner: linux-amd64-gpu-rtxpro6000-latest-2 diff 
--git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 538e05e75f..d24e04e317 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -65,18 +65,19 @@ jobs: - example: gpu timeout: 45 container_image: pytorch:26.01-py3 + # tests/gpu/_extensions/test_onnx_extensions.py fails for newer containers until https://github.com/tbenthompson/cppimport/pull/98 - example: gpu-megatron timeout: 45 container_image: pytorch:26.01-py3 - example: gpu-trtllm timeout: 30 - container_image: tensorrt-llm/release:1.3.0rc5 + container_image: tensorrt-llm/release:1.3.0rc10 runs-on: linux-amd64-gpu-rtxpro6000-latest-1 timeout-minutes: ${{ matrix.timeout }} container: &gpu_container image: nvcr.io/nvidia/${{ matrix.container_image }} env: - GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py + GIT_DEPTH: 1000 # For correct version PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages HF_TOKEN: ${{ secrets.HF_TOKEN }} steps: &gpu_steps diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index f2e862df32..605f930f2b 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -38,7 +38,7 @@ jobs: - uses: actions/checkout@v6 - uses: ./.github/actions/ubuntu-setup - name: Run unit tests - run: pip install tox && COV_ARGS="--cov" tox -e py312-torch210-tf_latest-unit + run: pip install tox && COV_ARGS="--cov" tox -e py312-torch211-tf_latest-unit - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v5 with: @@ -64,6 +64,7 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 30 strategy: + fail-fast: false matrix: py: [10, 11, 13] steps: @@ -72,15 +73,16 @@ jobs: with: python-version: "3.${{ matrix.py }}" - name: Run unit tests - run: pip install tox && tox -e py3${{ matrix.py }}-torch210-tf_latest-unit + run: pip install tox && tox -e py3${{ matrix.py }}-torch211-tf_latest-unit multi-torch: if: github.event_name == 
'pull_request' needs: [linux] runs-on: ubuntu-latest timeout-minutes: 30 strategy: + fail-fast: false matrix: - torch: [26, 27, 28, 29] + torch: [28, 29, 210] steps: - uses: actions/checkout@v6 - uses: ./.github/actions/ubuntu-setup @@ -92,13 +94,14 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 30 strategy: + fail-fast: false matrix: tf: [min] steps: - uses: actions/checkout@v6 - uses: ./.github/actions/ubuntu-setup - name: Run unit tests - run: pip install tox && tox -e py312-torch210-tf_${{ matrix.tf }}-unit + run: pip install tox && tox -e py312-torch211-tf_${{ matrix.tf }}-unit launcher: if: github.event_name == 'pull_request' needs: [linux] @@ -122,6 +125,7 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 30 strategy: + fail-fast: false matrix: test-env: [onnx, torch] steps: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d52ad0c2ad..3a0a4aab8d 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,5 +1,6 @@ -NVIDIA Model Optimizer Changelog -================================ +Changelog +========= + 0.44 (2026-05-xx) ^^^^^^^^^^^^^^^^^ @@ -15,6 +16,11 @@ NVIDIA Model Optimizer Changelog - Fix Minitron pruning (``mcore_minitron``) for MoE models. Importance estimation hooks were incorrectly registered for MoE modules and NAS step was hanging before this. +**Misc** + +- Bump minimum required PyTorch version to 2.8. +- Add experimental support for transformers>=5.0. Unified Hugging Face checkpoint export for quantized checkpoints may not work for some models with transformers>=5.0 yet. 
+ 0.43 (2026-04-09) ^^^^^^^^^^^^^^^^^ diff --git a/docs/source/getting_started/_installation_for_Linux.rst b/docs/source/getting_started/_installation_for_Linux.rst index 8071c34af3..2b2d4d8219 100644 --- a/docs/source/getting_started/_installation_for_Linux.rst +++ b/docs/source/getting_started/_installation_for_Linux.rst @@ -16,7 +16,7 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system +-------------------------+-----------------------------+ | CUDA | 12.x, 13.x | +-------------------------+-----------------------------+ -| PyTorch | >=2.6 | +| PyTorch | >=2.8 | +-------------------------+-----------------------------+ | TensorRT-LLM (Optional) | >=1.0 | +-------------------------+-----------------------------+ diff --git a/examples/gpt-oss/configs/sft_full.yaml b/examples/gpt-oss/configs/sft_full.yaml index 33273c1e92..c3ba873be2 100644 --- a/examples/gpt-oss/configs/sft_full.yaml +++ b/examples/gpt-oss/configs/sft_full.yaml @@ -16,7 +16,7 @@ per_device_train_batch_size: 2 per_device_eval_batch_size: 2 gradient_accumulation_steps: 2 max_length: 4096 -warmup_ratio: 0.03 +warmup_steps: 0.03 # use warmup_ratio if using transformers<5.0 lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 @@ -30,6 +30,6 @@ eval_steps: 8 dataset_test_split: test # ModelOpt Quantization Parameters -quant_cfg: # Examples: MXFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_ONLY_CFG - # For the full list of supported configs, do: mtq.config.choices +quant_cfg: # Examples: MXFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_ONLY_CFG + # For the full list of supported configs, do: mtq.config.choices calib_size: 128 diff --git a/examples/gpt-oss/configs/sft_lora.yaml b/examples/gpt-oss/configs/sft_lora.yaml index 34f76a6e71..4f35c36182 100644 --- a/examples/gpt-oss/configs/sft_lora.yaml +++ b/examples/gpt-oss/configs/sft_lora.yaml @@ -21,7 +21,7 @@ lora_alpha: 16 lora_dropout: 0.0 lora_target_modules: all-linear 
max_length: 4096 -warmup_ratio: 0.03 +warmup_steps: 0.03 # use warmup_ratio if using transformers<5.0 lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 @@ -35,6 +35,6 @@ eval_steps: 8 dataset_test_split: test # ModelOpt Quantization Parameters -quant_cfg: # Examples: MXFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_ONLY_CFG - # For the full list of supported configs, do: mtq.config.choices +quant_cfg: # Examples: MXFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_ONLY_CFG + # For the full list of supported configs, do: mtq.config.choices calib_size: 128 diff --git a/examples/gpt-oss/convert_oai_mxfp4_weight_only.py b/examples/gpt-oss/convert_oai_mxfp4_weight_only.py index bebb914869..4f471ef484 100644 --- a/examples/gpt-oss/convert_oai_mxfp4_weight_only.py +++ b/examples/gpt-oss/convert_oai_mxfp4_weight_only.py @@ -95,21 +95,23 @@ def convert_and_save(model, tokenizer, output_path: str): def create_parser(): parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--model_path", type=str, help="path to the fake-quantized model from QAT.") - + parser.add_argument( + "--trust_remote_code", + help="Set trust_remote_code for Huggingface models and tokenizers", + default=False, + action="store_true", + ) parser.add_argument( "--lora_path", type=str, help="path to the LoRA-QAT adapter weights. You can only specify lora_path or model_path, not both.", ) - parser.add_argument( "--base_path", type=str, help="path to the base model used for LoRA-QAT. Only used if lora_path is specified.", ) - parser.add_argument( "--output_path", type=str, required=True, help="location to save converted model." 
) @@ -121,7 +123,7 @@ def create_parser(): parser = create_parser() args = parser.parse_args() - kwargs = {"device_map": "auto", "torch_dtype": "auto", "trust_remote_code": True} + kwargs = {"device_map": "auto", "dtype": "auto", "trust_remote_code": args.trust_remote_code} if args.lora_path: assert args.model_path is None, "You can only specify lora_path or model_path, not both." model_path = args.base_path @@ -140,7 +142,7 @@ def create_parser(): gc.collect() # Load tokenizer - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=args.trust_remote_code) # Quantize and save model convert_and_save(model, tokenizer, args.output_path) diff --git a/examples/gpt-oss/qat-finetune-transformers.ipynb b/examples/gpt-oss/qat-finetune-transformers.ipynb index 695ed39f67..42226b2982 100644 --- a/examples/gpt-oss/qat-finetune-transformers.ipynb +++ b/examples/gpt-oss/qat-finetune-transformers.ipynb @@ -207,7 +207,7 @@ " per_device_eval_batch_size=1,\n", " gradient_accumulation_steps=2,\n", " max_length=4096,\n", - " warmup_ratio=0.03,\n", + " warmup_steps=0.03, # use warmup_ratio if using transformers<5.0\n", " eval_strategy=\"steps\",\n", " eval_on_start=True,\n", " logging_steps=10,\n", diff --git a/examples/gpt-oss/requirements.txt b/examples/gpt-oss/requirements.txt index 368097d337..d18f9eb539 100644 --- a/examples/gpt-oss/requirements.txt +++ b/examples/gpt-oss/requirements.txt @@ -1,5 +1,3 @@ kernels>=0.9.0 -torch>2.7.1 trackio -transformers>=4.55.0 trl>=0.21.0 diff --git a/examples/gpt-oss/sft.py b/examples/gpt-oss/sft.py index cc896021fa..6cdad5187c 100644 --- a/examples/gpt-oss/sft.py +++ b/examples/gpt-oss/sft.py @@ -72,7 +72,7 @@ def main(script_args, training_args, model_args, quant_args): "revision": model_args.model_revision, "trust_remote_code": model_args.trust_remote_code, "attn_implementation": model_args.attn_implementation, - "torch_dtype": 
getattr(model_args, "dtype", "bfloat16"), + "dtype": getattr(model_args, "dtype", "bfloat16"), "use_cache": not training_args.gradient_checkpointing, } diff --git a/examples/llm_autodeploy/run_auto_quantize.py b/examples/llm_autodeploy/run_auto_quantize.py index e9ecb0731f..389d8207b0 100644 --- a/examples/llm_autodeploy/run_auto_quantize.py +++ b/examples/llm_autodeploy/run_auto_quantize.py @@ -118,10 +118,11 @@ def modelopt_ptq( auto_quantize_bits: float | None = None, calib_dataset: str = "cnn_dailymail", calib_batch_size: int = 8, + trust_remote_code: bool = False, ) -> torch.nn.Module: """Quantize the model with modelopt.""" model = AutoModelForCausalLM.from_pretrained( - model_path, trust_remote_code=True, torch_dtype="auto", device_map="auto" + model_path, trust_remote_code=trust_remote_code, dtype="auto", device_map="auto" ) model.eval() @@ -129,7 +130,7 @@ def modelopt_ptq( model_path, model_max_length=2048, padding_side="left", - trust_remote_code=True, + trust_remote_code=trust_remote_code, ) # sanitize tokenizer if tokenizer.pad_token != "": @@ -203,6 +204,12 @@ def modelopt_ptq( "regular quantization without auto_quantize search will be applied." 
), ) + parser.add_argument( + "--trust_remote_code", + help="Set trust_remote_code for Huggingface models and tokenizers", + default=False, + action="store_true", + ) args = parser.parse_args() @@ -213,4 +220,5 @@ def modelopt_ptq( args.num_samples, auto_quantize_bits=args.effective_bits, calib_batch_size=args.calib_batch_size, + trust_remote_code=args.trust_remote_code, ) diff --git a/examples/llm_distill/requirements.txt b/examples/llm_distill/requirements.txt index 91dda9dafd..4bcd190839 100644 --- a/examples/llm_distill/requirements.txt +++ b/examples/llm_distill/requirements.txt @@ -1,4 +1,3 @@ pyarrow torchao>=0.14.1 -transformers<5.0 trl>=0.23.0 diff --git a/examples/llm_eval/lm_eval_hf.py b/examples/llm_eval/lm_eval_hf.py index 405e8590a5..11d736a429 100755 --- a/examples/llm_eval/lm_eval_hf.py +++ b/examples/llm_eval/lm_eval_hf.py @@ -38,6 +38,7 @@ # limitations under the License. import warnings +import datasets from lm_eval import utils from lm_eval.__main__ import cli_evaluate, parse_eval_args, setup_parser from lm_eval.api.model import T @@ -180,8 +181,6 @@ def setup_parser_with_modelopt_args(): model_args = utils.simple_parse_args_string(args.model_args) if args.trust_remote_code: - import datasets - datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True model_args["trust_remote_code"] = True args.trust_remote_code = None diff --git a/examples/llm_eval/modeling.py b/examples/llm_eval/modeling.py index d06d055603..71e048e1a3 100644 --- a/examples/llm_eval/modeling.py +++ b/examples/llm_eval/modeling.py @@ -74,6 +74,7 @@ class EvalModel(BaseModel, arbitrary_types_allowed=True): model_path: str + trust_remote_code: bool = False max_input_length: int = 512 max_output_length: int = 512 dtype: str = "auto" @@ -92,7 +93,6 @@ def load(self): class OpenAIModel(EvalModel): - model_path: str engine: str = "" use_azure: bool = False tokenizer: tiktoken.Encoding | None @@ -173,7 +173,6 @@ def handler(signum, frame): class SeqToSeqModel(EvalModel): - model_path: str 
model: PreTrainedModel | None = None tokenizer: PreTrainedTokenizer | None = None lora_path: str = "" @@ -188,10 +187,12 @@ def load(self): args.update(device_map="auto") if self.load_8bit: args.update(device_map="auto", load_in_8bit=True) - args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") + args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") if self.attn_implementation: args["attn_implementation"] = self.attn_implementation - self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_path, **args) + self.model = AutoModelForSeq2SeqLM.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code, **args + ) print_gpu_utilization() if self.lora_path: self.model = PeftModel.from_pretrained(self.model, self.lora_path) @@ -199,7 +200,9 @@ def load(self): if "device_map" not in args: self.model.to(self.device) if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() @@ -243,11 +246,11 @@ def load(self): args.update(device_map="auto") if self.load_8bit: args.update(device_map="auto", load_in_8bit=True) - args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") + args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") if self.attn_implementation: args["attn_implementation"] = self.attn_implementation self.model = AutoModelForCausalLM.from_pretrained( - self.model_path, trust_remote_code=True, **args + self.model_path, trust_remote_code=self.trust_remote_code, **args ) self.model.eval() if "device_map" not in args: @@ -256,7 +259,9 @@ def load(self): # Sampling with temperature will cause MMLU to drop self.model.generation_config.do_sample = False if self.tokenizer is None: - self.tokenizer = 
AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() @@ -322,7 +327,7 @@ def load(self): args.update(device_map="auto") if self.load_8bit: args.update(device_map="auto", load_in_8bit=True) - args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") + args.update(dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") self.model = LlamaForCausalLM.from_pretrained(self.model_path, **args) print_gpu_utilization() if self.lora_path: @@ -487,10 +492,12 @@ def test_max_length(self): class ChatGLMModel(SeqToSeqModel): def load(self): if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) if self.model is None: self.model = AutoModel.from_pretrained( - self.model_path, trust_remote_code=True + self.model_path, trust_remote_code=self.trust_remote_code ).half() # FP16 is required for ChatGLM self.model.eval() self.model.to(self.device) diff --git a/examples/llm_eval/requirements.txt b/examples/llm_eval/requirements.txt index 88faeac5ca..df47ac76c6 100644 --- a/examples/llm_eval/requirements.txt +++ b/examples/llm_eval/requirements.txt @@ -2,5 +2,4 @@ fire>=0.5.0 lm_eval[api,ifeval]==0.4.8 peft>=0.5.0 rwkv>=0.7.3 -tiktoken torchvision diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md index 4d22390763..0bba1d71e1 100755 --- a/examples/llm_ptq/README.md +++ b/examples/llm_ptq/README.md @@ -115,7 +115,7 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http | Kimi K2 | - | - | - | - | ✅ | | MiniMax M2.1 | - | - | - | - | ✅ | | T5 | ✅ | ✅ | ✅ | ✅ | - | -| Whisper | ✅ | ❌ | ❌ | ❌ | - | +| Whisper9 | ✅ | ❌ | ❌ | ❌ | - | | Nemotron-3 
| ✅ | ❌ | ❌ | ❌ | ✅ | > *This is a subset of the models supported. For the full list please check the [TensorRT-LLM support matrix](https://nvidia.github.io/TensorRT-LLM/reference/precision.html#support-matrix)* @@ -127,7 +127,8 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http > *5.A selective set of the popular models are internally tested. The actual model support list may be longer. NVFP4 inference requires Blackwell GPUs and TensorRT-LLM v0.17 or later* \ > *6.Some models currently support export to HF format only.* \ > *7.[PTQ for DeepSeek](../deepseek/README.md)* \ -> *8.GLM-4.7 has MTP (Multi-Token Prediction) layers that are automatically loaded and excluded from quantization.* +> *8.GLM-4.7 has MTP (Multi-Token Prediction) layers that are automatically loaded and excluded from quantization.* \ +> *9.Running Whisper model with transformers>=5.0 requires [torchcodec](https://github.com/meta-pytorch/torchcodec?tab=readme-ov-file#installing-cuda-enabled-torchcodec) and other system packages (e.g. ffmpeg).* > *The accuracy loss after PTQ may vary depending on the actual model and the quantization method. Different models may have different accuracy loss and usually the accuracy loss is more significant when the base model is small. If the accuracy after PTQ is not meeting the requirement, please try either modifying [hf_ptq.py](./hf_ptq.py) and disabling the KV cache quantization or using the [QAT](./../llm_qat/README.md) instead. 
For NVFP4 quantization specifically, we recommend `nvfp4_mlp_only`, `nvfp4_experts_only`, or `nvfp4_omlp_only` to achieve higher accuracy by restricting quantization to the MLP/expert layers (and optionally the `o_proj` layer) while keeping the attention QKV projections unquantized.* diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 58eb676111..a4515baacb 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -53,7 +53,13 @@ def run_nemotron_vl_preview( - full_model, tokenizer, input_ids, pyt_ckpt_path, stage_name, allow_fallback=False + full_model, + tokenizer, + input_ids, + pyt_ckpt_path, + stage_name, + allow_fallback=False, + trust_remote_code=False, ): """Run text-only and VL preview generation for Nemotron VL models. @@ -64,7 +70,7 @@ def run_nemotron_vl_preview( pyt_ckpt_path: Path to the model checkpoint stage_name: Description of the stage (e.g., "before quantization", "after quantization") allow_fallback: Whether to allow fallback to standard generate on failure - + trust_remote_code: Whether to trust remote code for Huggingface models and tokenizers Returns: Generated text response or None if generation failed """ @@ -80,7 +86,7 @@ def run_nemotron_vl_preview( # Try text-only generation (may fail for encoder-decoder models like Nemotron-Parse) text_response = run_text_only_generation( - full_model, tokenizer, question, generation_config, pyt_ckpt_path + full_model, tokenizer, question, generation_config, pyt_ckpt_path, trust_remote_code ) generated_ids = None @@ -93,7 +99,7 @@ def run_nemotron_vl_preview( # Run additional VL test with images print(f"Running additional VL test with images ({stage_name})...") - run_vl_preview_generation(full_model, tokenizer, pyt_ckpt_path, stage_name) + run_vl_preview_generation(full_model, tokenizer, pyt_ckpt_path, stage_name, trust_remote_code) return generated_ids @@ -567,7 +573,7 @@ def get_model( model_kwargs = config_kwargs.copy() # Don't set 
torch_dtype for VILA models as they handle it explicitly in their builder if "vila" not in ckpt_path.lower(): - model_kwargs.setdefault("torch_dtype", "auto") + model_kwargs.setdefault("dtype", "auto") if "vila" in ckpt_path.lower(): hf_vila = AutoModel.from_pretrained( @@ -618,7 +624,7 @@ def has_pack_quantized_config(config): ckpt_path, device_map="auto", trust_remote_code=trust_remote_code, - torch_dtype="auto", + dtype="auto", ) else: architecture = hf_config.architectures[0] @@ -650,7 +656,7 @@ def has_pack_quantized_config(config): model_kwargs2 = model_kwargs.copy() if auto_model_module not in [AutoModelForCausalLM, AutoModel]: model_kwargs2.pop("trust_remote_code", None) - model_kwargs2["torch_dtype"] = torch_dtype + model_kwargs2["dtype"] = torch_dtype model_kwargs2.pop("max_memory", None) model = from_config(hf_config, **model_kwargs2) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index b81dc60c01..54e0984c71 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -758,6 +758,7 @@ def pre_quantize( args.pyt_ckpt_path, "before quantization", allow_fallback=False, + trust_remote_code=args.trust_remote_code, ) else: generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=100) @@ -808,6 +809,7 @@ def post_quantize( args.pyt_ckpt_path, "after quantization", allow_fallback=False, + trust_remote_code=args.trust_remote_code, ) else: warnings.warn( diff --git a/examples/llm_ptq/multinode_ptq.py b/examples/llm_ptq/multinode_ptq.py index 624307cda2..93ef21ea4d 100644 --- a/examples/llm_ptq/multinode_ptq.py +++ b/examples/llm_ptq/multinode_ptq.py @@ -149,9 +149,7 @@ def load_and_prepare_model( Tuple of (prepared_model, model_type, original_architectures, calibration_dataloader) """ model = AutoModelForCausalLM.from_pretrained( - model_path, - torch_dtype="auto", - trust_remote_code=trust_remote_code, + model_path, dtype="auto", trust_remote_code=trust_remote_code ) model.eval() model_type = 
get_model_type(model) diff --git a/examples/llm_ptq/requirements-t5.txt b/examples/llm_ptq/requirements-t5.txt deleted file mode 100644 index 0347135464..0000000000 --- a/examples/llm_ptq/requirements-t5.txt +++ /dev/null @@ -1 +0,0 @@ -transformers==4.48.0 diff --git a/examples/llm_ptq/requirements-whisper.txt b/examples/llm_ptq/requirements-whisper.txt deleted file mode 100644 index a79b19aeee..0000000000 --- a/examples/llm_ptq/requirements-whisper.txt +++ /dev/null @@ -1,2 +0,0 @@ -librosa -soundfile diff --git a/examples/llm_ptq/requirements.txt b/examples/llm_ptq/requirements.txt index 1469d5552b..51f4b48625 100644 --- a/examples/llm_ptq/requirements.txt +++ b/examples/llm_ptq/requirements.txt @@ -2,6 +2,6 @@ compressed-tensors==0.12.0 fire flash-attn>=2.6.0 rouge_score>=0.1.2 -tiktoken +transformers<5.0 transformers_stream_generator zstandard diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 9919e405ba..abfebbd4f0 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -21,7 +21,7 @@ from transformers import AutoImageProcessor, AutoProcessor -def run_vl_preview_generation(model, tokenizer, model_path, stage_name): +def run_vl_preview_generation(model, tokenizer, model_path, stage_name, trust_remote_code=False): """Run preview generation for VL models using sample images. 
Args: @@ -29,7 +29,7 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): tokenizer: The tokenizer model_path: Path to the model (for loading image processor) stage_name: Description of the stage (e.g., "before quantization") - + trust_remote_code: Whether to trust remote code for Huggingface models and tokenizers Returns: Generated response text for logging/comparison """ @@ -85,7 +85,9 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): # Try to detect the VL model has chat method or generate method if hasattr(model, "chat"): - image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True) + image_processor = AutoImageProcessor.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) image_features = image_processor([image]) # Pass as list with single image @@ -103,7 +105,9 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): **image_features, ) else: - processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) + processor = AutoProcessor.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) # Use chat template if available, otherwise fall back to default task prompt if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None: @@ -190,7 +194,9 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): return None -def run_text_only_generation(model, tokenizer, question, generation_config, model_path): +def run_text_only_generation( + model, tokenizer, question, generation_config, model_path, trust_remote_code=False +): """Run text-only generation for VL models, supporting both chat and generate methods. 
Args: @@ -199,7 +205,7 @@ def run_text_only_generation(model, tokenizer, question, generation_config, mode question: The text question to ask generation_config: Generation configuration model_path: Path to the model (for loading processor if needed) - + trust_remote_code: Whether to trust remote code for Huggingface models and tokenizers Returns: Generated response text or None if failed """ @@ -209,7 +215,9 @@ def run_text_only_generation(model, tokenizer, question, generation_config, mode response = model.chat(tokenizer, None, question, generation_config, history=None) return response else: - processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) + processor = AutoProcessor.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) # Create text-only messages messages = [ diff --git a/examples/llm_qad/data_utils/download_dataset.py b/examples/llm_qad/data_utils/download_dataset.py index e3e3d0646e..42ef6280e1 100644 --- a/examples/llm_qad/data_utils/download_dataset.py +++ b/examples/llm_qad/data_utils/download_dataset.py @@ -30,14 +30,14 @@ _TOKENIZER = None -def init_tokenizer(name: str) -> None: +def init_tokenizer(name: str, trust_remote_code: bool = False) -> None: """Load HuggingFace tokenizer for chat template.""" global _TOKENIZER if name: from transformers import AutoTokenizer print(f"Loading tokenizer: {name}") - _TOKENIZER = AutoTokenizer.from_pretrained(name, trust_remote_code=True) + _TOKENIZER = AutoTokenizer.from_pretrained(name, trust_remote_code=trust_remote_code) def format_text(messages: list[dict], reasoning: str = "") -> str: @@ -159,10 +159,16 @@ def main(): p.add_argument( "--include-reasoning", action="store_true", help="Include COT for Thinking models" ) + p.add_argument( + "--trust_remote_code", + help="Set trust_remote_code for Huggingface models and tokenizers", + default=False, + action="store_true", + ) args = p.parse_args() if args.tokenizer: - init_tokenizer(args.tokenizer) + 
init_tokenizer(args.tokenizer, args.trust_remote_code) # Build suffix suffix = f"{int(args.sample_percent)}pct" diff --git a/examples/llm_qat/launch.sh b/examples/llm_qat/launch.sh index 6120476f17..cc3adc74fe 100755 --- a/examples/llm_qat/launch.sh +++ b/examples/llm_qat/launch.sh @@ -165,7 +165,7 @@ CMD="accelerate launch --config-file accelerate_config/$CONFIG_FILE $FSDP_ARGS \ --save_total_limit 2 \ --learning_rate $LR \ --weight_decay 0.0 \ - --warmup_ratio 0.1 \ + --warmup_steps 0.1 \ --lr_scheduler_type linear \ --logging_steps 1 \ --report_to tensorboard \ diff --git a/examples/llm_qat/main.py b/examples/llm_qat/main.py index 9435157259..2edbf3ccbb 100644 --- a/examples/llm_qat/main.py +++ b/examples/llm_qat/main.py @@ -166,9 +166,7 @@ def train(): print_rank_0(f"Last checkpoint detected: {last_checkpoint}") model = transformers.AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - cache_dir=training_args.cache_dir, - torch_dtype=torch.bfloat16, + model_args.model_name_or_path, cache_dir=training_args.cache_dir, dtype=torch.bfloat16 ) model.generation_config.do_sample = True tokenizer = transformers.AutoTokenizer.from_pretrained( @@ -223,7 +221,7 @@ def train(): teacher_model = transformers.AutoModelForCausalLM.from_pretrained( model_args.teacher_model, cache_dir=training_args.cache_dir, - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, ) distill_config = { "teacher_model": teacher_model, diff --git a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb index a9bb6589be..f52d596f7c 100644 --- a/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb +++ b/examples/llm_qat/notebooks/QAT_QAD_Walkthrough.ipynb @@ -275,7 +275,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "0bf60614-99a0-48b0-85a8-1d88cd7c72ba", "metadata": {}, "outputs": [], @@ -290,7 +290,7 @@ " per_device_eval_batch_size=1,\n", " gradient_accumulation_steps=2,\n", " 
max_length=4096,\n", - " warmup_ratio=0.03,\n", + " warmup_steps=0.03, # use warmup_ratio if using transformers<5.0\n", " eval_strategy=\"steps\",\n", " eval_on_start=True,\n", " logging_steps=50,\n", diff --git a/examples/llm_sparsity/attention_sparsity/hf_sa.py b/examples/llm_sparsity/attention_sparsity/hf_sa.py index ca92e5ebc9..d6c5bd025a 100644 --- a/examples/llm_sparsity/attention_sparsity/hf_sa.py +++ b/examples/llm_sparsity/attention_sparsity/hf_sa.py @@ -111,7 +111,7 @@ def generate_sample_output(model, tokenizer, args): padding=False, ) if torch.cuda.is_available(): - inputs = {k: v.cuda() for k, v in inputs.items()} + inputs = {k: v.to(model.device) for k, v in inputs.items()} # Generate with torch.no_grad(): @@ -143,10 +143,7 @@ def main(args): # No need to specify attn_implementation here — mtsa.sparsify() sets it # automatically ("eager" for pytorch backend, "modelopt_triton" for triton). model = AutoModelForCausalLM.from_pretrained( - args.pyt_ckpt_path, - attn_implementation="eager", - torch_dtype="auto", - device_map="auto", + args.pyt_ckpt_path, attn_implementation="eager", dtype="auto", device_map="auto" ) tokenizer = AutoTokenizer.from_pretrained(args.pyt_ckpt_path) diff --git a/examples/llm_sparsity/weight_sparsity/eval.py b/examples/llm_sparsity/weight_sparsity/eval.py index 6b1d4ef17b..a5f2fb91b2 100644 --- a/examples/llm_sparsity/weight_sparsity/eval.py +++ b/examples/llm_sparsity/weight_sparsity/eval.py @@ -129,7 +129,7 @@ def __call__(self, instances: Sequence[dict]) -> dict[str, torch.Tensor]: [instance[key] for instance in instances] for key in ("src_idx", "label_idx") ) - batch_encoded = self.tokenizer.batch_encode_plus( + batch_encoded = self.tokenizer( sources, return_tensors="pt", padding=True, @@ -254,7 +254,7 @@ def main(): dataloader = get_dataloader( accelerator, dataset, tokenizer, args.model_max_length, args.batch_size, shuffle=False ) - model = AutoModelForCausalLM.from_pretrained(args.model_dir, torch_dtype=torch.float16).to( 
+ model = AutoModelForCausalLM.from_pretrained(args.model_dir, dtype=torch.float16).to( accelerator.device ) diff --git a/examples/llm_sparsity/weight_sparsity/export_trtllm_ckpt.py b/examples/llm_sparsity/weight_sparsity/export_trtllm_ckpt.py index 0fb64f9589..2cf7ca3a7a 100644 --- a/examples/llm_sparsity/weight_sparsity/export_trtllm_ckpt.py +++ b/examples/llm_sparsity/weight_sparsity/export_trtllm_ckpt.py @@ -74,7 +74,7 @@ def get_model(ckpt_path, dtype="fp16", trust_remote_code=False): dtype = torch.float32 else: raise NotImplementedError(f"Unknown dtype {dtype}") - model_kwargs = {"torch_dtype": dtype} + model_kwargs = {"dtype": dtype} model = AutoModelForCausalLM.from_pretrained( ckpt_path, device_map="auto", **model_kwargs, trust_remote_code=trust_remote_code diff --git a/examples/llm_sparsity/weight_sparsity/finetune.py b/examples/llm_sparsity/weight_sparsity/finetune.py index 7110846683..6eb199adc5 100644 --- a/examples/llm_sparsity/weight_sparsity/finetune.py +++ b/examples/llm_sparsity/weight_sparsity/finetune.py @@ -297,13 +297,12 @@ def train(): ) last_checkpoint = None - if os.path.isdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: + if os.path.isdir(args.output_dir) and args.do_train and args.resume_from_checkpoint is None: last_checkpoint = get_last_checkpoint(args.output_dir) - if last_checkpoint is not None and args.resume_from_checkpoint is None: + if last_checkpoint is not None: print_rank_0( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this" - " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train" - " from scratch." + " behavior, change the `--output_dir` or pass `--resume_from_checkpoint`." ) model = transformers.AutoModelForCausalLM.from_pretrained( @@ -335,18 +334,12 @@ def train(): # Detecting last checkpoint. 
last_checkpoint = None - if os.path.isdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: + if os.path.isdir(args.output_dir) and args.do_train and args.resume_from_checkpoint is None: last_checkpoint = get_last_checkpoint(args.output_dir) - if last_checkpoint is None and len(os.listdir(args.output_dir)) > 0: - raise ValueError( - f"Output directory ({args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and args.resume_from_checkpoint is None: + if last_checkpoint is not None: print_rank_0( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this" - " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train" - " from scratch." + " behavior, change the `--output_dir` or pass `--resume_from_checkpoint`." ) # Training diff --git a/examples/llm_sparsity/weight_sparsity/hf_pts.py b/examples/llm_sparsity/weight_sparsity/hf_pts.py index ad8061211d..77574c1c2c 100644 --- a/examples/llm_sparsity/weight_sparsity/hf_pts.py +++ b/examples/llm_sparsity/weight_sparsity/hf_pts.py @@ -40,7 +40,7 @@ def get_calib_dataloader( else: raise NotImplementedError - batch_encoded = tokenizer.batch_encode_plus( + batch_encoded = tokenizer( dataset, return_tensors="pt", padding=True, truncation=True, max_length=block_size ) if device: @@ -98,7 +98,7 @@ def get_model(ckpt_path, dtype="fp16", trust_remote_code=False): dtype = torch.float32 else: raise NotImplementedError(f"Unknown dtype {dtype}") - model_kwargs = {"torch_dtype": dtype} + model_kwargs = {"dtype": dtype} model = AutoModelForCausalLM.from_pretrained( ckpt_path, device_map="auto", **model_kwargs, trust_remote_code=trust_remote_code diff --git a/examples/llm_sparsity/weight_sparsity/launch_finetune.sh b/examples/llm_sparsity/weight_sparsity/launch_finetune.sh index a65e1e6003..7f8e71f255 100755 --- a/examples/llm_sparsity/weight_sparsity/launch_finetune.sh +++ 
b/examples/llm_sparsity/weight_sparsity/launch_finetune.sh @@ -88,11 +88,11 @@ CMD="accelerate launch --multi_gpu --mixed_precision bf16 finetune.py \ --save_total_limit 10 \ --learning_rate 2e-5 \ --weight_decay 0.1 \ - --warmup_ratio 0.0 \ + --warmup_steps 0.0 \ --lr_scheduler_type cosine \ --logging_steps 1 \ --fsdp 'full_shard auto_wrap' \ - --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \ + --fsdp_config '{\"transformer_layer_cls_to_wrap\": \"LlamaDecoderLayer\"}' \ --tf32 True \ --modelopt_restore_path $MODELOPT_RESTORE_PATH \ --report_to tensorboard \ diff --git a/examples/specdec_bench/specdec_bench/models/specbench_medusa.py b/examples/specdec_bench/specdec_bench/models/specbench_medusa.py index e483f379c3..0165505d2f 100644 --- a/examples/specdec_bench/specdec_bench/models/specbench_medusa.py +++ b/examples/specdec_bench/specdec_bench/models/specbench_medusa.py @@ -100,7 +100,7 @@ def __init__( self.draft_model_path, model_dir, medusa_num_heads=self.medusa_num_heads, - torch_dtype=torch_dtype, + dtype=torch_dtype, low_cpu_mem_usage=True, ) self.model = self.model.to(self.device) diff --git a/examples/speculative_decoding/README.md b/examples/speculative_decoding/README.md index 2a29f644e6..8d75eb06f8 100644 --- a/examples/speculative_decoding/README.md +++ b/examples/speculative_decoding/README.md @@ -308,7 +308,7 @@ This will modify the model in-place with eagle training forward, making it compa ```python # Create a trainer -trainer = transformers.Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module) +trainer = transformers.Trainer(model=model, processing_class=tokenizer, args=training_args, **data_module) trainer._move_model_to_device(model, trainer.args.device) # Enable HF checkpointing so that the saved model will contain the speculative decoding module diff --git a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py 
b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py index a3d1681c4c..449b261c56 100644 --- a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py +++ b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py @@ -85,6 +85,12 @@ def parse_args() -> argparse.Namespace: default=1, help="""Data parallel world size. Number of tasks on SLURM.""", ) + parser.add_argument( + "--trust_remote_code", + help="Set trust_remote_code for Huggingface models and tokenizers", + default=False, + action="store_true", + ) return parser.parse_args() @@ -130,11 +136,11 @@ def keep_conversation(entry): dataset = dataset.select(range(args.debug_max_num_conversations)) model = AutoModel.from_pretrained( - args.model, torch_dtype="auto", device_map="auto", trust_remote_code=True + args.model, dtype="auto", device_map="auto", trust_remote_code=args.trust_remote_code ) num_hidden_layers = getattr(model.config, "num_hidden_layers", None) - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token tokenizer.chat_template = tokenizer.chat_template.replace(REMOVE_THINK_CHAT_TEMPLATE, "") diff --git a/examples/speculative_decoding/main.py b/examples/speculative_decoding/main.py index 3369d399c2..880c6b5672 100644 --- a/examples/speculative_decoding/main.py +++ b/examples/speculative_decoding/main.py @@ -185,7 +185,7 @@ def train(): if checkpoint: with patch_transformers5_params_loading(): model = load_vlm_or_llm( - checkpoint, torch_dtype="auto", trust_remote_code=model_args.trust_remote_code + checkpoint, dtype="auto", trust_remote_code=model_args.trust_remote_code ) tokenizer = transformers.AutoTokenizer.from_pretrained( checkpoint, trust_remote_code=model_args.trust_remote_code @@ -197,7 +197,7 @@ def train(): 
model_args.model_name_or_path, use_fake_base=model_args.use_fake_base_for_offline, use_offline_training=use_offline_training, - torch_dtype="auto", + dtype="auto", device_map="cpu", trust_remote_code=model_args.trust_remote_code, ) diff --git a/examples/speculative_decoding/requirements.txt b/examples/speculative_decoding/requirements.txt index 6324bac62b..409c35f0ed 100644 --- a/examples/speculative_decoding/requirements.txt +++ b/examples/speculative_decoding/requirements.txt @@ -1,2 +1 @@ -accelerate==1.12.0 -transformers==5.0.0rc1 +transformers<5.4 diff --git a/examples/speculative_decoding/scripts/ar_validate.py b/examples/speculative_decoding/scripts/ar_validate.py index d1bf31a1ab..1ad7bec409 100644 --- a/examples/speculative_decoding/scripts/ar_validate.py +++ b/examples/speculative_decoding/scripts/ar_validate.py @@ -55,6 +55,7 @@ def validate_ar(model, tokenizer, ds, steps=3, osl=20, num_samples=80, device=No def main(): parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, required=True, help="Path to model directory") + parser.add_argument("--trust_remote_code", action="store_true", help="Trust remote code") parser.add_argument("--steps", type=int, default=3, help="Steps for AR validation") parser.add_argument( "--osl", type=int, default=32, help="Output sequence length for AR validation" @@ -72,8 +73,12 @@ def main(): accelerator = Accelerator() # Load model and tokenizer - model = load_vlm_or_llm(args.model_path, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(args.model_path) + model = load_vlm_or_llm( + args.model_path, device_map="auto", trust_remote_code=args.trust_remote_code + ) + tokenizer = AutoTokenizer.from_pretrained( + args.model_path, trust_remote_code=args.trust_remote_code + ) model.eval() model = accelerator.prepare(model) diff --git a/examples/speculative_decoding/scripts/export_hf_checkpoint.py b/examples/speculative_decoding/scripts/export_hf_checkpoint.py index 925f4b73d0..98ea438f1b 100644 
--- a/examples/speculative_decoding/scripts/export_hf_checkpoint.py +++ b/examples/speculative_decoding/scripts/export_hf_checkpoint.py @@ -29,6 +29,7 @@ def parse_args(): description="Export a HF checkpoint (with ModelOpt state) for deployment." ) parser.add_argument("--model_path", type=str, default="Path of the trained checkpoint.") + parser.add_argument("--trust_remote_code", action="store_true", help="Trust remote code") parser.add_argument( "--export_path", type=str, default="Destination directory for exported files." ) @@ -38,11 +39,8 @@ def parse_args(): mto.enable_huggingface_checkpointing() args = parse_args() -model = load_vlm_or_llm(args.model_path, torch_dtype="auto") +model = load_vlm_or_llm(args.model_path, dtype="auto", trust_remote_code=args.trust_remote_code) model.eval() with torch.inference_mode(): - export_speculative_decoding( - model, - export_dir=args.export_path, - ) + export_speculative_decoding(model, export_dir=args.export_path) print(f"Exported checkpoint to {args.export_path}") diff --git a/examples/speculative_decoding/scripts/send_conversation_vllm.py b/examples/speculative_decoding/scripts/send_conversation_vllm.py index 5101b4e6f9..d1a5ac5c11 100644 --- a/examples/speculative_decoding/scripts/send_conversation_vllm.py +++ b/examples/speculative_decoding/scripts/send_conversation_vllm.py @@ -55,6 +55,12 @@ def parse_args() -> argparse.Namespace: "the local serving engine. This should match the value used by the server." 
), ) + parser.add_argument( + "--trust_remote_code", + help="Set trust_remote_code for Huggingface models and tokenizers", + default=False, + action="store_true", + ) ## Client Parameters ## parser.add_argument( "--base-url", @@ -133,7 +139,9 @@ async def main(args: argparse.Namespace) -> None: base_url=args.base_url, ) - tokenizer = AutoTokenizer.from_pretrained(args.model_card, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained( + args.model_card, trust_remote_code=args.trust_remote_code + ) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token bos_token_id = tokenizer.bos_token_id diff --git a/examples/vllm_serve/fakequant_worker.py b/examples/vllm_serve/fakequant_worker.py index ec2b1f4033..262f2a5360 100644 --- a/examples/vllm_serve/fakequant_worker.py +++ b/examples/vllm_serve/fakequant_worker.py @@ -49,8 +49,7 @@ def _fakequant_run_prolog_worker(self) -> None: trust_remote_code = os.environ.get("TRUST_REMOTE_CODE", "false").lower() == "true" tokenizer = AutoTokenizer.from_pretrained( - self.model_runner.model_config.tokenizer, - trust_remote_code=trust_remote_code, + self.model_runner.model_config.tokenizer, trust_remote_code=trust_remote_code ) if tokenizer.pad_token != "" or tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token diff --git a/examples/vlm_ptq/requirements-vila.txt b/examples/vlm_ptq/requirements-vila.txt deleted file mode 100644 index 7391a5f268..0000000000 --- a/examples/vlm_ptq/requirements-vila.txt +++ /dev/null @@ -1,3 +0,0 @@ -deepspeed>=0.16.0 -git+https://github.com/bfshi/scaling_on_scales.git -transformers<=4.50.0 diff --git a/examples/vlm_ptq/requirements.txt b/examples/vlm_ptq/requirements.txt new file mode 100644 index 0000000000..180f534118 --- /dev/null +++ b/examples/vlm_ptq/requirements.txt @@ -0,0 +1 @@ +transformers<5.0 diff --git a/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt 
b/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt index 8409b2f8ea..7108970c7c 100644 --- a/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt +++ b/examples/windows/accuracy_benchmark/kl_divergence_metrics/requirements.txt @@ -3,6 +3,5 @@ accelerate datasets numpy safetensors>=0.4.0 - -torch>=2.0.0 -transformers>=4.30.0 +torch>=2.6.0 +transformers<5.0 diff --git a/examples/windows/accuracy_benchmark/mmlu_benchmark.py b/examples/windows/accuracy_benchmark/mmlu_benchmark.py index 4eb2fd6190..54573e6425 100644 --- a/examples/windows/accuracy_benchmark/mmlu_benchmark.py +++ b/examples/windows/accuracy_benchmark/mmlu_benchmark.py @@ -501,7 +501,11 @@ def evaluate_func(args, subject, dev_df, test_df): tokenizer = get_tokenizer(model_ckpt_path, trust_remote_code=trust_remote_code) model = select_model( - max_input_length=MAX_SEQ_LEN, max_output_length=2, dtype=dtype, **kwargs + max_input_length=MAX_SEQ_LEN, + max_output_length=2, + dtype=dtype, + trust_remote_code=trust_remote_code, + **kwargs, ) assert isinstance(model, EvalModel) if quant_cfg: diff --git a/examples/windows/accuracy_benchmark/modeling.py b/examples/windows/accuracy_benchmark/modeling.py index 273a944c57..f17300be94 100644 --- a/examples/windows/accuracy_benchmark/modeling.py +++ b/examples/windows/accuracy_benchmark/modeling.py @@ -49,6 +49,7 @@ class EvalModel(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) model_path: str + trust_remote_code: bool = False max_input_length: int = 512 max_output_length: int = 512 dtype: str = "auto" @@ -84,7 +85,9 @@ def load(self): args.update(torch_dtype=getattr(torch, self.dtype)) else: args.update(torch_dtype="auto") - self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_path, **args) + self.model = AutoModelForSeq2SeqLM.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code, **args + ) print_gpu_utilization() if self.lora_path: self.model = 
PeftModel.from_pretrained(self.model, self.lora_path) @@ -92,7 +95,9 @@ def load(self): if "device_map" not in args: self.model.to(self.device) if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() @@ -143,7 +148,7 @@ def load(self): args.update(device_map="auto", load_in_8bit=True) args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") self.model = AutoModelForCausalLM.from_pretrained( - self.model_path, trust_remote_code=True, **args + self.model_path, trust_remote_code=self.trust_remote_code, **args ) self.model.eval() if "device_map" not in args: @@ -152,7 +157,9 @@ def load(self): # Sampling with temperature will cause MMLU to drop self.model.generation_config.do_sample = False if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, **kwargs) -> str: self.load() @@ -200,7 +207,7 @@ def load(self): args.update(device_map="auto", load_in_8bit=True) args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto") self.model = AutoAWQForCausalLM.from_quantized( - self.model_path, trust_remote_code=True, **args + self.model_path, trust_remote_code=self.trust_remote_code, **args ) self.model.eval() if "device_map" not in args: @@ -209,7 +216,9 @@ def load(self): # Sampling with temperature will cause MMLU to drop self.model.config.do_sample = False if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=self.trust_remote_code + ) def run(self, prompt: str, 
**kwargs) -> str: self.load() diff --git a/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt b/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt index 4bdac071cf..c9eadf1b09 100644 --- a/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt +++ b/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt @@ -8,4 +8,4 @@ pandas sentencepiece>=0.2.1 tokenizers>=0.14.1 torch>=2.6.0 -transformers>=4.53 +transformers<5.0 diff --git a/examples/windows/accuracy_benchmark/requirements.txt b/examples/windows/accuracy_benchmark/requirements.txt index ad4c91cacd..cb3f95140e 100644 --- a/examples/windows/accuracy_benchmark/requirements.txt +++ b/examples/windows/accuracy_benchmark/requirements.txt @@ -1,6 +1,5 @@ datasets>=2.14.5 fire==0.6.0 -fire>=0.5.0 numpy==1.26.4 openai>=0.28.1 pandas==2.2.2 diff --git a/examples/windows/onnx_ptq/genai_llm/quantize.py b/examples/windows/onnx_ptq/genai_llm/quantize.py index d21d1d796b..13f6ac8045 100644 --- a/examples/windows/onnx_ptq/genai_llm/quantize.py +++ b/examples/windows/onnx_ptq/genai_llm/quantize.py @@ -180,7 +180,7 @@ def get_initial_inputs( """ # tokenizer.pad_token = "[PAD]" tokenizer.pad_token = tokenizer.eos_token - encodings_dict = tokenizer.batch_encode_plus(prompt, padding=True) + encodings_dict = tokenizer(prompt, padding=True) # max_length = model.config.max_position_embeddings # input_ids = tokenizer.encode(text, truncation=True, padding='max_length', max_length=max_length) @@ -242,7 +242,7 @@ def get_calib_inputs( # dataset2 = dataset2.shuffle(seed=42) dataset2 = dataset2[column][:calib_size] - batch_encoded = tokenizer.batch_encode_plus( + batch_encoded = tokenizer( dataset2, return_tensors="pt", padding=True, truncation=True, max_length=block_size ) # return_tensors="pt", batch_encoded = batch_encoded.to(device) diff --git a/examples/windows/onnx_ptq/whisper/README.md b/examples/windows/onnx_ptq/whisper/README.md index 8757aaeb53..82ae782200 
100644 --- a/examples/windows/onnx_ptq/whisper/README.md +++ b/examples/windows/onnx_ptq/whisper/README.md @@ -174,7 +174,7 @@ These scripts are currently validated with following settings: - Calibration size - 32 - Calibration EPs - \[`cuda`, `cpu`\] - Audio dataset - `librispeech_asr` dataset (32 samples used for calibration, 100+ samples used for WER test) - - `load_dataset("librispeech_asr", "clean", split="test", trust_remote_code=True)` + - `load_dataset("librispeech_asr", "clean", split="test")` - Quantization support for various ONNX files - `encoder_model.onnx`, `decoder_model.onnx`, `decoder_with_past_model.onnx` - The `use_merged` argument in optimum-ORT's Whisper model API is kept False. diff --git a/examples/windows/onnx_ptq/whisper/whisper_onnx_quantization.py b/examples/windows/onnx_ptq/whisper/whisper_onnx_quantization.py index 7b3e3d3197..03d2c49801 100644 --- a/examples/windows/onnx_ptq/whisper/whisper_onnx_quantization.py +++ b/examples/windows/onnx_ptq/whisper/whisper_onnx_quantization.py @@ -275,7 +275,7 @@ def main(args): processor = WhisperProcessor.from_pretrained(args.model_name, cache_dir=args.cache_dir) - asr_dataset = load_dataset("librispeech_asr", "clean", split="test", trust_remote_code=True) + asr_dataset = load_dataset("librispeech_asr", "clean", split="test") # asr_dataset = load_dataset("librispeech_asr", "all", split="test.clean") calib_data = None diff --git a/examples/windows/onnx_ptq/whisper/whisper_optimum_ort_inference.py b/examples/windows/onnx_ptq/whisper/whisper_optimum_ort_inference.py index 52d56fe048..a1f39b8f04 100644 --- a/examples/windows/onnx_ptq/whisper/whisper_optimum_ort_inference.py +++ b/examples/windows/onnx_ptq/whisper/whisper_optimum_ort_inference.py @@ -85,9 +85,7 @@ def main(args): print(f"\n\n-- Content of input audio-file = {prediction}\n\n") if args.run_wer_test: - librispeech_test_clean = load_dataset( - "librispeech_asr", "clean", split="test", trust_remote_code=True - ) + librispeech_test_clean = 
load_dataset("librispeech_asr", "clean", split="test") references = [] predictions = [] diff --git a/examples/windows/torch_onnx/diffusers/qad_example/requirements.txt b/examples/windows/torch_onnx/diffusers/qad_example/requirements.txt index f6aa9bfda7..0aafd11840 100644 --- a/examples/windows/torch_onnx/diffusers/qad_example/requirements.txt +++ b/examples/windows/torch_onnx/diffusers/qad_example/requirements.txt @@ -6,7 +6,5 @@ ltx-trainer @ git+https://github.com/Lightricks/LTX-2.git#subdirectory=packages/ # NVIDIA ModelOpt (quantization & distillation) nvidia-modelopt -pyyaml safetensors -torch>=2.0 diff --git a/modelopt/__init__.py b/modelopt/__init__.py index c64e30b14a..1490782795 100644 --- a/modelopt/__init__.py +++ b/modelopt/__init__.py @@ -15,7 +15,6 @@ """Nvidia Model Optimizer (modelopt).""" -import warnings as _warnings from importlib.metadata import version as _version __version__ = _version("nvidia-modelopt") diff --git a/modelopt/onnx/llm_export_utils/export_utils.py b/modelopt/onnx/llm_export_utils/export_utils.py index 4009b119e7..a6d2b607ac 100644 --- a/modelopt/onnx/llm_export_utils/export_utils.py +++ b/modelopt/onnx/llm_export_utils/export_utils.py @@ -53,7 +53,7 @@ def load_model(self, trust_remote_code: bool = False) -> AutoModelForCausalLM: """Load HuggingFace model based on model type.""" print(f"Loading HF model from {self.hf_model_path} with model type {self.model_type}") self.hf_model = AutoModelForCausalLM.from_pretrained( - self.hf_model_path, torch_dtype=torch.float16, trust_remote_code=trust_remote_code + self.hf_model_path, dtype=torch.float16, trust_remote_code=trust_remote_code ) return self.hf_model.eval().cuda() # type: ignore[attr-defined] @@ -76,7 +76,7 @@ def __init__(self, model): self.lm_head = model.lm_head self.config = model.config - def forward(self, input_ids: torch.Tensor | None, past_key_values: tuple): + def forward(self, input_ids: torch.Tensor, past_key_values: tuple): """Forward pass.""" # Convert tuple cache 
to DynamicCache for models that require it (e.g., Qwen3) cache = DynamicCache(config=self.config) @@ -84,9 +84,30 @@ def forward(self, input_ids: torch.Tensor | None, past_key_values: tuple): cache.value_cache = [kv[1] for kv in past_key_values] past_key_values = cache - outputs = self.model(input_ids=input_ids, past_key_values=past_key_values, use_cache=True) + # Pre-compute a 4D causal mask so that transformers' internal mask creation + # (which relies on Python-int shapes) is bypassed entirely. During ONNX/JIT tracing, + # tensor.shape[N] can return a 0-dim scalar tensor instead of a Python int, which breaks + # the masking code in transformers>=5.4 + seq_len = input_ids.shape[1] + past_len = past_key_values.get_seq_length() # type: ignore[attr-defined] + causal_mask = ( + torch.tril( + torch.ones(seq_len, past_len + seq_len, dtype=torch.bool, device=input_ids.device), + diagonal=past_len, + ) + .unsqueeze(0) + .unsqueeze(0) + ) + + outputs = self.model( + input_ids=input_ids, + attention_mask=causal_mask, + past_key_values=past_key_values, + use_cache=True, + ) hidden_states = outputs[0] - past_key_values = outputs.past_key_values.to_legacy_cache() + cache = outputs.past_key_values + past_key_values = tuple(zip(cache.key_cache, cache.value_cache)) logits = self.lm_head(hidden_states) return logits, past_key_values diff --git a/modelopt/onnx/quantization/extensions.py b/modelopt/onnx/quantization/extensions.py index 13956eeac3..68facdaac8 100644 --- a/modelopt/onnx/quantization/extensions.py +++ b/modelopt/onnx/quantization/extensions.py @@ -18,6 +18,7 @@ import os import sys +# TODO: cppimport is no longer maintained, switch to a different library import cppimport from modelopt.onnx.logging_config import logger @@ -30,6 +31,8 @@ sys.path.remove(path) except Exception as e: logger.warning( - f"{e}\nUnable to load `modelopt_round_and_pack_ext', falling back to python based optimized version" + f"{e}\nUnable to load `modelopt_round_and_pack_ext', falling back to 
python based optimized version. " + "If you see `copy_file() got an unexpected keyword argument 'dry_run'`, you will need " + "https://github.com/tbenthompson/cppimport/pull/98 or downgrade setuptools until we have a workaround" ) round_and_pack_ext = None diff --git a/modelopt/torch/__init__.py b/modelopt/torch/__init__.py index ec62b86ffc..190e94529c 100644 --- a/modelopt/torch/__init__.py +++ b/modelopt/torch/__init__.py @@ -22,20 +22,24 @@ from . import distill, nas, opt, peft, prune, quantization, sparsity, speculative, utils -if _Version(_torch_version) < _Version("2.7"): +if _Version(_torch_version) < _Version("2.9"): _warnings.warn( - "nvidia-modelopt will drop torch<2.7 support in a future release.", DeprecationWarning + "nvidia-modelopt will drop torch<2.9 support in a future release.", DeprecationWarning ) -# Since `hf` dependencies are optional and users have pre-installed transformers, we need to ensure -# correct version is installed to avoid incompatibility issues. + try: from transformers import __version__ as _transformers_version - if not (_Version("4.56") <= _Version(_transformers_version) < _Version("5.0")): + if _Version(_transformers_version) < _Version("4.56"): + _warnings.warn( + f"transformers {_transformers_version} is not tested with current version of modelopt and may cause issues." + " Please install recommended version with `pip install -U nvidia-modelopt[hf]` if working with HF models.", + ) + elif _Version(_transformers_version) >= _Version("5.0"): _warnings.warn( - f"transformers version {_transformers_version} is not tested with nvidia-modelopt and may cause issues. " - "Please install recommended version with `pip install nvidia-modelopt[hf]` if working with HF models.", + "transformers>=5.0 support is experimental. 
Unified Hugging Face checkpoint export for quantized " + "checkpoints may not work for some models yet.", ) except ImportError: pass diff --git a/modelopt/torch/export/model_config_export.py b/modelopt/torch/export/model_config_export.py index b9acb80c8b..ae92e2776f 100644 --- a/modelopt/torch/export/model_config_export.py +++ b/modelopt/torch/export/model_config_export.py @@ -151,7 +151,8 @@ def torch_to_tensorrt_llm_checkpoint( model_metadata_config = model.config.__dict__ vocab_size = model.config.vocab_size hf_config = model.config - architecture = model.config.architectures[0] + architectures = getattr(model.config, "architectures", None) + architecture = architectures[0] if architectures else "" # For Baichuan 13B, we check if alibi is used with the alibi_mask property. if hasattr(model, "model") and hasattr(model.model, "alibi_mask"): diff --git a/modelopt/torch/export/tensorrt_llm_utils.py b/modelopt/torch/export/tensorrt_llm_utils.py index 75708dbcde..f49fcd4899 100755 --- a/modelopt/torch/export/tensorrt_llm_utils.py +++ b/modelopt/torch/export/tensorrt_llm_utils.py @@ -48,6 +48,7 @@ "gemma": "GemmaForCausalLM", "gemma3": "Gemma3ForCausalLM", "gpt": "GPTForCausalLM", + "qwen": "QWenForCausalLM", "enc": "EncoderModel", "dec": "DecoderModel", "mllama": "MLLaMAModel", @@ -240,7 +241,7 @@ def convert_to_tensorrt_llm_config( layernorm_type_map = {i.name: i.value for i in LayerNormType} layernorm_position_map = {i.name: i.value for i in LayerNormPositionType} - if decoder_type in ["gpt", "gemma", "llama"]: + if decoder_type in ["gpt", "gemma", "llama", "qwen"]: pass elif decoder_type == "mpt": config.update( diff --git a/modelopt/torch/opt/plugins/huggingface.py b/modelopt/torch/opt/plugins/huggingface.py index 8b6396f3e7..db077487c0 100644 --- a/modelopt/torch/opt/plugins/huggingface.py +++ b/modelopt/torch/opt/plugins/huggingface.py @@ -23,6 +23,8 @@ from typing import Any import torch +from huggingface_hub import try_to_load_from_cache +from 
huggingface_hub.errors import HFValidationError from modelopt.torch.utils import print_rank_0 @@ -57,7 +59,16 @@ def register_for_patching(name: str, cls: type, patch_methods: list[tuple[str, A def _get_modelopt_state_path(model_name_or_path: str) -> str: - return os.path.join(model_name_or_path, _MODELOPT_STATE_SAVE_NAME) + """Get the path to the ModelOpt state file or empty string if not found. + + Also handles HF model card as input path. However for hf hub models, we dont have modelopt_state at the moment. + """ + if os.path.isdir(model_name_or_path): + return os.path.join(model_name_or_path, _MODELOPT_STATE_SAVE_NAME) + try: + return try_to_load_from_cache(model_name_or_path, _MODELOPT_STATE_SAVE_NAME) or "" + except HFValidationError: + return "" @contextmanager diff --git a/modelopt/torch/opt/plugins/transformers.py b/modelopt/torch/opt/plugins/transformers.py index 7cfdc8ca0c..9cc729723e 100644 --- a/modelopt/torch/opt/plugins/transformers.py +++ b/modelopt/torch/opt/plugins/transformers.py @@ -15,6 +15,7 @@ """ModelOpt plugin for enabling automatic save/restore of ModelOpt state for HuggingFace models.""" +import os import types from contextlib import contextmanager @@ -24,8 +25,9 @@ from modelopt.torch.utils import report_memory -from ..conversion import ModeloptStateManager +from ..conversion import ModeloptStateManager, load_modelopt_state from .huggingface import ( + _get_modelopt_state_path, _new_save_pretrained, _patch_model_init_for_modelopt, enable_huggingface_checkpointing, @@ -60,6 +62,39 @@ def _undo_torch_init_override_by_transformers(): setattr(torch.nn.init, name, init_func) +def _restore_qtensor_wrappers(model, model_path): + """Re-wrap QTensorWrapper weights that were replaced during HF weight loading. + + Transformers>=5.0 uses ``setattr`` to load weights, which replaces ``QTensorWrapper`` + objects with plain ``Parameter`` tensors. 
The compressed data is loaded correctly but + the wrapper metadata (original shape, dtype, qtensor class) is lost. This function + reads the saved ``q_tensor_state`` from ``modelopt_state.pth`` and re-wraps the affected + weights. + """ + modelopt_state_path = _get_modelopt_state_path(model_path) + if not os.path.isfile(modelopt_state_path): + return + + from modelopt.torch.quantization.nn.modules.quant_linear import RealQuantLinear + from modelopt.torch.quantization.qtensor import QTensorWrapper + + state = load_modelopt_state(modelopt_state_path) + for _, mode_config in state["modelopt_state_dict"]: + q_tensor_state = mode_config.get("metadata", {}).get("q_tensor_state", {}) + if not q_tensor_state: + continue + for name, module in model.named_modules(): + if ( + isinstance(module, RealQuantLinear) + and name in q_tensor_state + and not isinstance(module.weight, QTensorWrapper) + ): + module._parameters["weight"] = QTensorWrapper( + qtensor=module.weight.data, + metadata=q_tensor_state[name]["metadata"], + ) + + def _new_from_pretrained(cls, /, pretrained_model_name_or_path, *args, **kwargs): """Patch for `cls.from_pretrained` method to restore ModelOpt state.""" with _patch_model_init_for_modelopt( @@ -69,6 +104,8 @@ def _new_from_pretrained(cls, /, pretrained_model_name_or_path, *args, **kwargs) pretrained_model_name_or_path, *args, **kwargs ) + _restore_qtensor_wrappers(model, pretrained_model_name_or_path) + return model @@ -93,12 +130,12 @@ def _save_pretrained_with_checks(self, save_directory, *args, **kwargs): # [Fix for huggingface bug] deepspeed zero3 training backend only loads params into the model from # state_dict, but not buffers. So lets explicitly load the buffers into the model from state_dict. 
-def _load_params_and_buffers_into_zero3_model(model_to_load, state_dict): +def _load_params_and_buffers_into_zero3_model(model_to_load, state_dict, load_config=None): buffer_names = [name for name, _ in model_to_load.named_buffers()] buffer_state_dict = {k: v for k, v in state_dict.items() if k in buffer_names} model_to_load.load_state_dict(buffer_state_dict, strict=False) return tf_modeling_utils._modelopt_cache["_load_state_dict_into_zero3_model"]( - model_to_load, state_dict + model_to_load, state_dict, load_config ) diff --git a/modelopt/torch/quantization/backends/nvfp4_gemm.py b/modelopt/torch/quantization/backends/nvfp4_gemm.py index ffc18fea33..e7d2b90ff7 100644 --- a/modelopt/torch/quantization/backends/nvfp4_gemm.py +++ b/modelopt/torch/quantization/backends/nvfp4_gemm.py @@ -176,7 +176,7 @@ def backward(ctx, grad_outputs): grad_weight = grad_outputs.reshape(-1, grad_outputs.shape[-1]).T @ input_tensor.reshape( -1, input_tensor.shape[-1] ) - if ctx.compute_bias_grad is not None: + if ctx.compute_bias_grad: # Sum all dimensions except the last one grad_bias = grad_outputs.sum(dim=list(range(grad_outputs.dim() - 1))) diff --git a/modelopt/torch/quantization/nn/modules/quant_linear.py b/modelopt/torch/quantization/nn/modules/quant_linear.py index bcb71e4c93..bb65d59077 100644 --- a/modelopt/torch/quantization/nn/modules/quant_linear.py +++ b/modelopt/torch/quantization/nn/modules/quant_linear.py @@ -246,26 +246,39 @@ def __init__(self, weight_quantizer: TensorQuantizer, *args, **kwargs): self.weight_quantizer = weight_quantizer def __setitem__(self, key, value): - if ( - key == "weight" - and self.weight_quantizer - and self.weight_quantizer.is_enabled - and not self.weight_quantizer._fake_quant - and value.element_size() > 1 - ): - # reset the amax for later calibration + if key == "weight" and not isinstance(value, QTensorWrapper): + existing = self.get("weight") if ( - self.weight_quantizer.amax is not None - and self.weight_quantizer.amax.is_meta + 
isinstance(existing, QTensorWrapper) + and not existing.is_meta + and existing.shape == value.shape ): - delattr(self.weight_quantizer, "_amax") - self.weight_quantizer.amax = self.weight_quantizer._get_amax(value) - self.weight_quantizer._calibrator.reset() - # compress the weight - real_quant_tensor = self.weight_quantizer(value) - real_quant_value = QTensorWrapper(real_quant_tensor) - del value # delete the original weight to save memory - value = real_quant_value + # Loading a compressed weight (e.g. from safetensors in transformers>=5.0 + # which replaces parameters via setattr rather than copy_). Preserve the + # QTensorWrapper type and metadata. + super().__setitem__( + key, QTensorWrapper(qtensor=value.data, metadata=existing.metadata) + ) + return + if ( + self.weight_quantizer + and self.weight_quantizer.is_enabled + and not self.weight_quantizer._fake_quant + and value.element_size() > 1 + ): + # reset the amax for later calibration + if ( + self.weight_quantizer.amax is not None + and self.weight_quantizer.amax.is_meta + ): + delattr(self.weight_quantizer, "_amax") + self.weight_quantizer.amax = self.weight_quantizer._get_amax(value) + self.weight_quantizer._calibrator.reset() + # compress the weight + real_quant_tensor = self.weight_quantizer(value) + real_quant_value = QTensorWrapper(real_quant_tensor) + del value # delete the original weight to save memory + value = real_quant_value super().__setitem__(key, value) # Monkey patch the _parameters.__setitem__ to real quant the weight when loading diff --git a/modelopt/torch/quantization/plugins/accelerate.py b/modelopt/torch/quantization/plugins/accelerate.py index 59731cc8ad..13999df0f0 100644 --- a/modelopt/torch/quantization/plugins/accelerate.py +++ b/modelopt/torch/quantization/plugins/accelerate.py @@ -190,8 +190,10 @@ def patched_from_pretrained(cls, /, pretrained_model_name_or_path, *args, **kwar with init_empty_weights(): # Fix torch_dtype to match original model - torch_dtype = 
kwargs.get("torch_dtype", getattr(config, "torch_dtype", torch.float16)) - model = cls.from_config(config, torch_dtype=torch_dtype) + torch_dtype = kwargs.get( + "dtype", kwargs.get("torch_dtype", getattr(config, "torch_dtype", torch.float16)) + ) + model = cls.from_config(config, dtype=torch_dtype) mtq.quantize(model, quant_cfg) mtq.compress(model, config=mtq.CompressConfig(quant_gemm=quant_gemm)) diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index 0d02716a6e..b40623aa20 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -172,14 +172,20 @@ def forward(self, *args, **kwargs): The forward method is used to patch the attention interface with _quantized_attention. Once output tensors are generated, it restores the original attention interface. """ + # In transformers>=5.0 some attention classes (e.g. BertAttention) no longer store + # `self.config` directly; fall back to searching child modules for a config attribute. 
+ _config = getattr(self, "config", None) + if _config is None: + _config = next( + (getattr(m, "config", None) for m in self.children() if hasattr(m, "config")), + None, + ) + _attn_impl = getattr(_config, "_attn_implementation", None) if _config is not None else None def _is_eager_attention(): - if self.config._attn_implementation == "eager": + if _attn_impl is None or _attn_impl == "eager": return True - return bool( - self.config._attn_implementation == "sdpa" - and kwargs.get("output_attentions", False) - ) + return bool(_attn_impl == "sdpa" and kwargs.get("output_attentions", False)) # Get the original transformers module before wrapped in any ModelOpt DynamicModule module: ModuleType = inspect.getmodule(self.get_attn_type(self)) @@ -188,7 +194,7 @@ def _is_eager_attention(): original_attention_interface = ( module.eager_attention_forward if _is_eager_attention() - else module.ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + else module.ALL_ATTENTION_FUNCTIONS[_attn_impl] ) patch_fn = partial(self._quantized_attention, original_attention_interface) @@ -201,7 +207,7 @@ def _is_eager_attention(): ) module.eager_attention_forward = patch_fn # type: ignore[attr-defined] else: - module.ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] = patch_fn + module.ALL_ATTENTION_FUNCTIONS[_attn_impl] = patch_fn try: outputs = super().forward(*args, **kwargs) @@ -210,9 +216,7 @@ def _is_eager_attention(): if _is_eager_attention(): module.eager_attention_forward = original_attention_interface # type: ignore[attr-defined] else: - module.ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] = ( - original_attention_interface - ) + module.ALL_ATTENTION_FUNCTIONS[_attn_impl] = original_attention_interface return outputs @@ -333,10 +337,14 @@ class HFParallelLinear(torch.nn.Linear, DynamicModule): shard = None def _setup(self): - assert self.weight.placements == self.shard, ( - f"Received unexpected shard {self.weight.placements} for {self}" - ) - tp_group = 
self.weight.device_mesh.get_group() + if isinstance(self.weight, torch.distributed.tensor.DTensor): # transformers<5.0 + assert self.weight.placements == self.shard, ( + f"Received unexpected shard {self.weight.placements} for {self}" + ) + device_mesh = self.weight.device_mesh + else: # transformers>=5.0: weights are plain Parameters, mesh is on the module + device_mesh = self._hf_device_mesh + tp_group = device_mesh.get_group() self._parallel_state = ParallelState(data_parallel_group=-1, tensor_parallel_group=tp_group) @classmethod @@ -371,14 +379,17 @@ def fold_weight(self, keep_attrs: bool = False): @contextmanager def enable_weight_access_and_writeback(self): - assert self.weight.placements == self.shard, ( - f"Received unexpected shard {self.weight.placements} for {self}" - ) - weight = self.weight - # TODO: To support TP + FSDP, we need to redistribute the tensor with replicate instead of shard - self.weight = nn.Parameter(weight.to_local()) - yield - self.weight = weight + if isinstance(self.weight, torch.distributed.tensor.DTensor): # transformers<5.0 + assert self.weight.placements == self.shard, ( + f"Received unexpected shard {self.weight.placements} for {self}" + ) + weight = self.weight + # TODO: To support TP + FSDP, we need to redistribute the tensor with replicate instead of shard + self.weight = nn.Parameter(weight.to_local()) + yield + self.weight = weight + else: # transformers>=5.0: weights are already plain Parameters + yield @QuantModuleRegistry.register({HFColumnParallelLinear: "HFColumnParallelLinear"}) @@ -523,7 +534,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: super().forward(hidden_states) self.gate.top_k = original_top_k else: - # Path for transformers < 5.0 + # Path for transformers<5.0 if hasattr(self, "gate") and hasattr(self.gate, "top_k"): top_k_owner = self.gate else: @@ -596,22 +607,20 @@ def _setup(self): """Modify the DbrxExpert.""" # No setup is needed for DbrxExpert, we only need to update 
DbrxExpertGLU - # forward method copied from the original dbrx repo - https://github.com/databricks/dbrx/blob/a3200393/model/modeling_dbrx.py#L795 def forward( self, x: torch.Tensor, - weights: torch.Tensor, - top_weights: torch.Tensor, top_experts: torch.LongTensor, + top_weights: torch.Tensor, ) -> torch.Tensor: bsz, q_len, hidden_size = x.shape x = x.view(-1, hidden_size) out = torch.zeros_like(x) - expert_mask = nn.functional.one_hot(top_experts, num_classes=self.moe_num_experts).permute( + expert_mask = nn.functional.one_hot(top_experts, num_classes=self.num_experts).permute( 2, 1, 0 ) - for expert_idx in range(self.moe_num_experts): + for expert_idx in range(self.num_experts): topk_idx, token_idx = torch.where(expert_mask[expert_idx]) if token_idx.shape[0] == 0: continue @@ -641,41 +650,48 @@ def _copy_weights(modules, weights): with torch.no_grad(): module.weight.copy_(weights[expert_idx].detach()) + # In transformers 5.0, DbrxExpertGLU.forward uses raw matmul: x @ w1[i] where + # w1[i] has shape (ffn_hidden_size, hidden_size). To match via F.linear (which + # computes x @ W.T), we store weights transposed: W = w1[i].T. 
self.w1_linear = nn.ModuleList( [ - nn.Linear(self.hidden_size, self.ffn_hidden_size, bias=False) + nn.Linear(self.ffn_hidden_size, self.hidden_size, bias=False) for _ in range(self.moe_num_experts) ] ) _copy_weights( self.w1_linear, - self.w1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size), + self.w1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size).transpose( + 1, 2 + ), ) delattr(self, "w1") self.v1_linear = nn.ModuleList( [ - nn.Linear(self.hidden_size, self.ffn_hidden_size, bias=False) + nn.Linear(self.ffn_hidden_size, self.hidden_size, bias=False) for _ in range(self.moe_num_experts) ] ) _copy_weights( self.v1_linear, - self.v1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size), + self.v1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size).transpose( + 1, 2 + ), ) delattr(self, "v1") + # w2: down_proj uses intermediate.matmul(w2[i].t()) = F.linear(intermediate, w2[i]) + # so W = w2[i] directly (no extra transpose needed). self.w2_linear = nn.ModuleList( [ - nn.Linear(self.ffn_hidden_size, self.hidden_size, bias=False) + nn.Linear(self.hidden_size, self.ffn_hidden_size, bias=False) for _ in range(self.moe_num_experts) ] ) _copy_weights( self.w2_linear, - self.w2.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size).transpose( - 1, 2 - ), + self.w2.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size), ) delattr(self, "w2") @@ -875,11 +891,18 @@ def num_experts(self): @property def top_k(self): - return self.router.moe_top_k + # In older transformers, top_k was stored on DbrxRouter as moe_top_k. + # In transformers 5.0, DbrxFFN stores it as a plain attribute (top_k). 
+ if hasattr(self.router, "moe_top_k"): + return self.router.moe_top_k + return self.__dict__.get("top_k", 1) @top_k.setter def top_k(self, value): - self.router.moe_top_k = value + if hasattr(self.router, "moe_top_k"): + self.router.moe_top_k = value + else: + self.__dict__["top_k"] = value @contextmanager @@ -902,10 +925,7 @@ def patch_compressed_linear_loading(): with patch_compressed_linear_loading(): model = AutoModelForCausalLM.from_pretrained( - ckpt_path, - device_map="auto", - trust_remote_code=True, - torch_dtype="auto", + ckpt_path, device_map="auto", trust_remote_code=True, dtype="auto" ) """ try: diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index b0d2786509..2536327843 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -29,7 +29,7 @@ import modelopt.torch.quantization as mtq from modelopt.torch.distill.plugins.huggingface import KDTrainer from modelopt.torch.opt.plugins import ModelOptHFTrainer -from modelopt.torch.utils import print_rank_0 +from modelopt.torch.utils import get_module_device, print_rank_0 from ..config import QuantizeConfig from ..nn import TensorQuantizer @@ -344,8 +344,10 @@ def _load_best_model(self, *args, **kwargs): ), "Some base_layer parameters are not frozen" adapter_name = self.model.active_adapters()[0] + device = get_module_device(self.model) self.model.delete_adapter(adapter_name) self.model.load_adapter(self.state.best_model_checkpoint, adapter_name) + self.model.to(device) else: super()._load_best_model(*args, **kwargs) diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py index 4340b8dc1f..22f2079649 100644 --- a/modelopt/torch/quantization/utils/core_utils.py +++ b/modelopt/torch/quantization/utils/core_utils.py @@ -524,7 +524,14 @@ def sync_moe_expert_amax(experts): 2. 
For any ``weight_quantizer`` that is enabled but has ``amax is None`` (expert received no tokens during calibration), runs a weight-only ``max_calibrate`` to populate the missing amax. + + No-op for batched expert modules (e.g. transformers>=5.0 ``Qwen3MoeExperts``) + that store all expert weights in a single 3D tensor without per-expert sub-modules. """ + if not hasattr(experts, "__iter__"): + # transformers>=5.0: batched experts, no per-expert quantizers + return + from ..nn import TensorQuantizer amax_dict: dict[str, torch.Tensor] = {} diff --git a/modelopt/torch/sparsity/attention_sparsity/model_sparsify.py b/modelopt/torch/sparsity/attention_sparsity/model_sparsify.py index 28c18943a2..a33938b057 100644 --- a/modelopt/torch/sparsity/attention_sparsity/model_sparsify.py +++ b/modelopt/torch/sparsity/attention_sparsity/model_sparsify.py @@ -139,7 +139,7 @@ def forward_loop(model) -> float: model = AutoModelForCausalLM.from_pretrained( model_path, attn_implementation="eager", # Required for sparse attention - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, ) This is because sparse attention works by patching torch.nn.functional.softmax, diff --git a/modelopt/torch/speculative/eagle/default_config.py b/modelopt/torch/speculative/eagle/default_config.py index f8c4924c19..224823ad17 100644 --- a/modelopt/torch/speculative/eagle/default_config.py +++ b/modelopt/torch/speculative/eagle/default_config.py @@ -25,6 +25,7 @@ "high_freq_factor": 4.0, "original_max_position_embeddings": 8192, "rope_type": "llama3", + "rope_theta": 500000.0, }, "rope_theta": 500000.0, "num_hidden_layers": 1, diff --git a/modelopt/torch/speculative/plugins/transformers.py b/modelopt/torch/speculative/plugins/transformers.py index 8561a390fc..b787a89063 100644 --- a/modelopt/torch/speculative/plugins/transformers.py +++ b/modelopt/torch/speculative/plugins/transformers.py @@ -75,11 +75,6 @@ CACHED_SHARD_TTT_MASKS = {} -def _get_empty_cache(config): - """Return an empty cache. 
Handle different versions of transformers for unit tests.""" - return DynamicCache(config=config) - - @MedusaDMRegistry.register({PreTrainedModel: "hf.PreTrainedModel"}) class HFMedusaModel(MedusaModel): """Medusa Model Class for huggingface models.""" @@ -287,9 +282,9 @@ def __init__(self, config, decoder_layer_cls, bias=False): num_layers=self.config.parallel_draft_heads_num_layers, ) - def _maybe_init_rope(self): + def _maybe_init_rope(self, device=None): if self.config.eagle_decoder_type == "llama" and not hasattr(self, "rotary_emb"): - self.rotary_emb = LlamaRotaryEmbedding(config=self.config) + self.rotary_emb = LlamaRotaryEmbedding(config=self.config, device=device) def _expand_first_attn_in_dim(self, first_layer_attn): """Modify qkv projection in first layer to accept 2h hidden size.""" @@ -565,12 +560,21 @@ def modify( elif self.eagle_decoder_type == "kimik2": decoder_cls = _setup_kimi_k2_decoder() - self.eagle_config = PretrainedConfig.from_dict(config.eagle_architecture_config) + arch_config = config.eagle_architecture_config + + # Populate base-model-dependent fields before constructing PretrainedConfig, + # since transformers >=5.4 validates rope_scaling during __init__. 
+ arch_config.setdefault("hidden_size", self._base_llm_config.hidden_size) + arch_config.setdefault("vocab_size", self._base_llm_config.vocab_size) + arch_config.setdefault( + "max_position_embeddings", self._base_llm_config.max_position_embeddings + ) + rope_scaling = arch_config.get("rope_scaling") + if rope_scaling and "rope_theta" not in rope_scaling and "rope_theta" in arch_config: + rope_scaling["rope_theta"] = arch_config["rope_theta"] + + self.eagle_config = PretrainedConfig.from_dict(arch_config) self.eagle_config.eagle_decoder_type = self.eagle_decoder_type - # Hidden size and vocab size must match base model - self.eagle_config.hidden_size = self._base_llm_config.hidden_size - self.eagle_config.vocab_size = self._base_llm_config.vocab_size - self.eagle_config.max_position_embeddings = self._base_llm_config.max_position_embeddings self.eagle_config.draft_vocab_size = getattr( self.eagle_config, "draft_vocab_size", self.eagle_config.vocab_size ) @@ -751,7 +755,10 @@ def _compute_ttt_attention_mask( ) -> BlockMask | torch.Tensor: """Return TTT attention_mask tensor of type BlockMask or Tensor depends on eagle attn impl.""" msk_func = get_ttt_msk_func(seq_length, ttt_step) - dtypemin = torch.finfo(self._base_llm_config.dtype).min + dtype = ( + self._base_llm_config.dtype or self.eagle_module.layers[0].input_layernorm.weight.dtype + ) + dtypemin = torch.finfo(dtype).min q_len = seq_length kv_len = seq_length * (1 + ttt_step) if self.eagle_config._attn_implementation == "flex_attention": @@ -767,7 +774,7 @@ def _compute_ttt_attention_mask( torch.arange(kv_len).view(1, 1, 1, kv_len), ).to(self.device) tensor_mask = torch.full_like( - tensor_mask, 0, dtype=self._base_llm_config.dtype, device=self.device + tensor_mask, 0, dtype=dtype, device=self.device ).masked_fill(~tensor_mask, dtypemin) return tensor_mask @@ -910,9 +917,9 @@ def forward( ) if not isinstance(past_key_values, Cache): - past_key_values = _get_empty_cache(self._base_llm_config) + past_key_values 
= DynamicCache(config=self._base_llm_config) if not isinstance(eagle_cache, Cache): - eagle_cache = _get_empty_cache(self.eagle_module.config) + eagle_cache = DynamicCache(config=self.eagle_module.config) past_key_values.eagle_cache = eagle_cache # ====Prepare inputs for the first eagle forward pass==== @@ -937,7 +944,7 @@ def forward( base_outputs, ) - self.eagle_module._maybe_init_rope() + self.eagle_module._maybe_init_rope(device=input_ids.device) # ====Run eagle forward with extra training-time-test steps==== for ttt_step in range(self.eagle_ttt_steps): @@ -1070,7 +1077,7 @@ def pseudo_speculative_generate( else: eagle_input_hidden_states = base_model_hidden_states - self.eagle_module._maybe_init_rope() + self.eagle_module._maybe_init_rope(device=eagle_input_hidden_states.device) draft_tokens = [] for step in range(steps): b, seq_length = eagle_ids.shape diff --git a/modelopt/torch/speculative/utils.py b/modelopt/torch/speculative/utils.py index 9e167c8dc9..7bc6e2be0a 100644 --- a/modelopt/torch/speculative/utils.py +++ b/modelopt/torch/speculative/utils.py @@ -488,7 +488,7 @@ def load_vlm_or_llm( model_name_or_path: str, use_fake_base: bool = False, use_offline_training: bool = False, - torch_dtype: str | torch.dtype | None = None, + dtype: str | torch.dtype | None = None, device_map: str | None = None, trust_remote_code: bool = False, ): @@ -502,7 +502,7 @@ def load_vlm_or_llm( Args: model_name_or_path: Local path or HuggingFace repo ID of the model. use_offline_training: Whether to load a memory-efficient model for offline training. - torch_dtype: dtype to use when loading the model. + dtype: dtype to use when loading the model. device_map: Device map passed to ``from_pretrained``. trust_remote_code: Whether to trust remote code. 
""" @@ -528,7 +528,7 @@ def load_vlm_or_llm( model = model_cls.from_pretrained( model_name_or_path, trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, + dtype=dtype, device_map=device_map, **extra, ) diff --git a/modelopt/torch/trace/plugins/transformers.py b/modelopt/torch/trace/plugins/transformers.py index f07a37601b..ad7a8cf019 100644 --- a/modelopt/torch/trace/plugins/transformers.py +++ b/modelopt/torch/trace/plugins/transformers.py @@ -15,8 +15,11 @@ """Utilities to describe symbols in the dynamic attention module.""" +import torch +import transformers +from packaging.version import Version from torch import nn -from transformers.models.bert.modeling_bert import BertAttention +from transformers.models.bert.modeling_bert import BertAttention, BertLayer from transformers.models.gptj.modeling_gptj import GPTJAttention from ..symbols import Symbol, SymInfo, SymMap @@ -56,3 +59,57 @@ def get_hf_attn_sym_info_sortable(mod: nn.Module) -> SymInfo: @SymMap.register([GPTJAttention]) def get_hf_attn_sym_info_unsortable(mod: nn.Module) -> SymInfo: return get_hf_attn_sym_info(sortable_attn=True) + + +# In transformers>=5.0, BertLayer.forward uses tuple unpacking on the BertAttention output +# (e.g. `self_attn_out, _ = self.attention(...)`), which FX symbolic tracing cannot handle when +# BertAttention is a registered leaf (the proxy is not iterable). Patch BertLayer.forward to use +# indexing instead, and call feed_forward_chunk directly (equivalent to apply_chunking_to_forward +# with chunk_size=0, which is the default for BERT). +if Version(transformers.__version__) >= Version("5.0"): + + def _fx_friendly_bert_layer_forward( + self, + hidden_states: torch.Tensor, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + cache_position=None, + **kwargs, + ): + # Use indexing instead of tuple-unpacking so FX can trace through BertLayer + # when BertAttention is a registered leaf (returns an opaque Proxy). 
+ # Accept **kwargs so that a parent trace (e.g. BertEncoder) passing extra kwargs + # like position_ids does not mark BertLayer as failed. However, do NOT forward + # **kwargs into self.attention: FX represents **kwargs as a Proxy(_kwargs), so + # unpacking it with ** would trigger "Proxy cannot be iterated". Additionally, + # BertSelfAttention ignores these kwargs (e.g. position_ids) in practice. + _attn_outputs = self.attention( + hidden_states, + attention_mask, + past_key_values=past_key_values, + cache_position=cache_position, + ) + attention_output = _attn_outputs[0] + + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with" + " cross-attention layers by setting `config.add_cross_attention=True`" + ) + _cross_outputs = self.crossattention( + attention_output, + None, + encoder_hidden_states, + encoder_attention_mask, + past_key_values=past_key_values, + ) + attention_output = _cross_outputs[0] + + # Call feed_forward_chunk directly (equivalent to apply_chunking_to_forward when + # chunk_size_feed_forward=0, which is the BERT default). 
+ return self.feed_forward_chunk(attention_output) + + BertLayer.forward = _fx_friendly_bert_layer_forward diff --git a/modelopt/torch/utils/speech_dataset_utils.py b/modelopt/torch/utils/speech_dataset_utils.py index a71d73773e..ef0660175e 100644 --- a/modelopt/torch/utils/speech_dataset_utils.py +++ b/modelopt/torch/utils/speech_dataset_utils.py @@ -48,9 +48,7 @@ def _get_speech_dataset(dataset_name: str, num_samples: int): # Use streaming can reduce the downloading time for large datasets dataset = load_dataset( - **SUPPORTED_SPEECH_DATASET_CONFIG[dataset_name]["config"], - trust_remote_code=True, - streaming=True, + **SUPPORTED_SPEECH_DATASET_CONFIG[dataset_name]["config"], streaming=True ) else: raise NotImplementedError( diff --git a/pyproject.toml b/pyproject.toml index 96490dff0a..4aa7ad86e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ dependencies = [ "nvidia-ml-py>=12", "packaging", "setuptools>=80", # torch.utils.cpp_extension imports setuptools at load time - "torch>=2.6", + "torch>=2.8", "tqdm", # modelopt.torch "PyYAML>=6.0", @@ -81,7 +81,8 @@ hf = [ "nltk", "peft>=0.17.0", "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export - "transformers>=4.56,<5.0", # Should match modelopt/torch/__init__.py and tox.ini + "tiktoken", + "transformers>=4.56.0", # Should match modelopt/torch/__init__.py and tox.ini "wonderwords", ] dev-lint = [ diff --git a/tests/_test_utils/examples/llm_ptq_utils.py b/tests/_test_utils/examples/llm_ptq_utils.py index 3bd7a39222..17a0764275 100644 --- a/tests/_test_utils/examples/llm_ptq_utils.py +++ b/tests/_test_utils/examples/llm_ptq_utils.py @@ -16,12 +16,10 @@ import importlib.metadata as metadata import subprocess from dataclasses import asdict, dataclass -from pathlib import Path import pytest import torch - -PTQ_EXAMPLE_DIR = Path(__file__).parents[3] / "examples" / "llm_ptq" +from _test_utils.examples.run_command import run_llm_ptq_command 
@dataclass @@ -32,6 +30,7 @@ class PTQCommand: sparsity: str | None = None kv_cache_quant: str | None = None trust_remote_code: bool = False + calib_dataset: str = "cnn_dailymail" calib_batch_size: int | None = None auto_quantize_bits: float | None = None tp: int | None = None @@ -47,37 +46,23 @@ def run(self, model_path: str): self.min_sm % 10, ): pytest.skip(reason=f"Requires sm{self.min_sm} or higher") - return if self.max_sm and torch.cuda.get_device_capability() > ( self.max_sm // 10, self.max_sm % 10, ): pytest.skip(reason=f"Requires sm{self.max_sm} or lower") - return if self.min_gpu and torch.cuda.device_count() < self.min_gpu: pytest.skip(reason=f"Requires at least {self.min_gpu} GPUs") - return param_dict = asdict(self) - param_dict.pop("min_sm", None) + param_dict.pop("max_sm", None) param_dict.pop("min_gpu", None) - trust_remote_code = param_dict.pop("trust_remote_code", False) - - args = ["--model", model_path] - for key, value in param_dict.items(): - if value is not None: - args.append(f"--{key}") - args.append(f"{value}") - - if trust_remote_code: - args.append("--trust_remote_code") - - self.command = ["scripts/huggingface_example.sh", "--no-verbose", *args] - subprocess.run(self.command, cwd=PTQ_EXAMPLE_DIR, check=True) + quant = param_dict.pop("quant") + run_llm_ptq_command(model=model_path, quant=quant, **param_dict) def param_str(self): param_dict = asdict(self) diff --git a/tests/_test_utils/examples/models.py b/tests/_test_utils/examples/models.py index abedd7b2a4..8bf2b95a60 100644 --- a/tests/_test_utils/examples/models.py +++ b/tests/_test_utils/examples/models.py @@ -64,8 +64,8 @@ def _select_path(remote_id: str, local_id: str) -> str: ) QWEN_VL_PATH = _select_path( - remote_id="Qwen/Qwen2-VL-2B-Instruct", - local_id="Qwen2-VL-2B-Instruct", + remote_id="Qwen/Qwen3-VL-2B-Instruct", + local_id="Qwen3-VL-2B-Instruct", ) # Diffusers diff --git a/tests/_test_utils/torch/quantization/tensor_quantizer_common.py 
b/tests/_test_utils/torch/quantization/tensor_quantizer_common.py index ad2722dca6..8559192718 100644 --- a/tests/_test_utils/torch/quantization/tensor_quantizer_common.py +++ b/tests/_test_utils/torch/quantization/tensor_quantizer_common.py @@ -144,10 +144,9 @@ def test_max_calib(self): rtol=0, ) - @pytest.mark.manual(reason="slow test, run with --run-manual") def test_entropy_and_percentile_calib(self): """Don't really have a good way to test it.""" - quant_attr_cfg1 = QuantizerAttributeConfig(calib_method="histogram") + quant_attr_cfg1 = QuantizerAttributeConfig(calibrator="histogram") quantizer1 = TensorQuantizer(quant_attr_cfg1, if_calib=True, if_quant=False).to(self.device) x_1 = torch.rand(3, 6, 7, 7).to(self.device) diff --git a/tests/_test_utils/torch/vision_models.py b/tests/_test_utils/torch/vision_models.py index 40e99c8d01..639dc16695 100644 --- a/tests/_test_utils/torch/vision_models.py +++ b/tests/_test_utils/torch/vision_models.py @@ -132,10 +132,10 @@ def get_model_and_input(on_gpu: bool = False): ], _create_torchvision_segmentation_fn, ), - "unet": ( - ["unet_carvana"], - _create_unet_fn, - ), + # "unet": ( + # ["unet_carvana"], + # _create_unet_fn, + # ), } diff --git a/tests/examples/llm_ptq/test_llm_ptq.py b/tests/examples/llm_ptq/test_llm_ptq.py index f5d0b39c1d..a5a470eea6 100644 --- a/tests/examples/llm_ptq/test_llm_ptq.py +++ b/tests/examples/llm_ptq/test_llm_ptq.py @@ -12,10 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- - import pytest -from _test_utils.examples.llm_ptq_utils import PTQCommand, WithRequirements +import transformers +from _test_utils.examples.llm_ptq_utils import PTQCommand from _test_utils.examples.models import ( BART_PATH, MIXTRAL_PATH, @@ -23,6 +22,7 @@ TINY_LLAMA_PATH, WHISPER_PATH, ) +from packaging.version import Version @pytest.mark.parametrize( @@ -36,18 +36,9 @@ def test_ptq_bart(command): command.run(BART_PATH) -class TestT5(WithRequirements): - requirements = [("transformers", "4.48.0")] - - @pytest.mark.parametrize( - "command", - [ - PTQCommand(quant="fp8", min_sm=89), - ], - ids=PTQCommand.param_str, - ) - def test_ptq_t5(self, command): - command.run(T5_PATH) +@pytest.mark.parametrize("command", [PTQCommand(quant="fp8", min_sm=89)], ids=PTQCommand.param_str) +def test_ptq_t5(command): + command.run(T5_PATH) @pytest.mark.parametrize( @@ -61,22 +52,20 @@ def test_ptq_mixtral(command): command.run(MIXTRAL_PATH) -class TestWhisper(WithRequirements): - requirements = [ - ("librosa", None), - ("soundfile", None), - ] - - @pytest.mark.parametrize( - "command", - [ - # Auto-batch-size computation seems to take >10mins for Whisper hence using a fixed batch size - PTQCommand(quant="fp8", calib_batch_size=16, min_sm=89), - ], - ids=PTQCommand.param_str, - ) - def test_ptq_whisper(self, command): - command.run(WHISPER_PATH) +@pytest.mark.skipif( + Version(transformers.__version__) >= Version("5.0"), + reason="Whisper requires torchcodec and other system packages for transformers>=5.0", +) +@pytest.mark.parametrize( + "command", + [ + # Auto-batch-size computation seems to take >10mins for Whisper hence using a fixed batch size + PTQCommand(quant="fp8", calib_batch_size=16, calib_dataset="peoples_speech", min_sm=89), + ], + ids=PTQCommand.param_str, +) +def test_ptq_whisper(command): + command.run(WHISPER_PATH) @pytest.mark.parametrize( diff --git a/tests/examples/llm_qat/test_llm_qat.py b/tests/examples/llm_qat/test_llm_qat.py index ebdb670247..5a0e7ad442 
100644 --- a/tests/examples/llm_qat/test_llm_qat.py +++ b/tests/examples/llm_qat/test_llm_qat.py @@ -17,6 +17,7 @@ import pytest import torch from _test_utils.examples.run_command import run_example_command +from _test_utils.torch.misc import minimum_sm # fmt: off @@ -98,7 +99,7 @@ def test_llama_lora_qat_nvfp4(tiny_llama_path, tmp_path): ] ) - +@minimum_sm(90) def test_llama_qlora_nvfp4(tiny_llama_path, tmp_path): _run_command( [ diff --git a/tests/examples/speculative_decoding/conftest.py b/tests/examples/speculative_decoding/conftest.py index 80417f4048..34ab4e4741 100644 --- a/tests/examples/speculative_decoding/conftest.py +++ b/tests/examples/speculative_decoding/conftest.py @@ -13,30 +13,31 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - import pytest -from _test_utils.examples.run_command import MODELOPT_ROOT, run_example_command +import yaml +from _test_utils.examples.run_command import run_example_command @pytest.fixture(scope="session", autouse=True) def tiny_daring_anteater_path(tmp_path_factory): - dataset_path = ( - MODELOPT_ROOT / "examples/speculative_decoding/input_conversations/daring-anteater.jsonl" + tmp_dir = tmp_path_factory.mktemp("daring_anteater") + output_file = tmp_dir / "train.jsonl" + + config = { + "outputs": [ + { + "filename": str(output_file), + "global_limit": 100, + "sources": [{"name": "daring-anteater", "splits": {"all": 100}}], + } + ] + } + config_path = tmp_dir / "data_config.yaml" + config_path.write_text(yaml.dump(config)) + + run_example_command( + ["python", "prepare_input_conversations/make_dataset.py", "-f", str(config_path), "--full"], + "speculative_decoding", ) - if not os.path.exists(dataset_path): - try: - run_example_command( - ["python", "prepare_input_conversations/add_daring_anteater.py"], - "speculative_decoding", - ) - except Exception as e: - # Ignore rate-limiting errors - pytest.skip(f"Failed to prepare dataset: {e}") - output_path = 
tmp_path_factory.mktemp("daring_anteater") / "train.jsonl" - with open(dataset_path) as src, open(output_path, "w") as dst: - for i, line in enumerate(src): - if i >= 128: - break - dst.write(line) - return output_path + + return output_file diff --git a/tests/examples/speculative_decoding/test_eagle.py b/tests/examples/speculative_decoding/test_eagle.py index 271241bcb0..ca094bc6e2 100644 --- a/tests/examples/speculative_decoding/test_eagle.py +++ b/tests/examples/speculative_decoding/test_eagle.py @@ -22,6 +22,7 @@ import torch from _test_utils.examples.run_command import run_example_command from packaging.version import Version +from transformers import AutoConfig from modelopt.torch.export.plugins.hf_spec_export import LLAMA_EAGLE_SINGLE_LAYER @@ -105,11 +106,11 @@ def test_llama_eagle3(tiny_llama_path, tiny_daring_anteater_path, tmp_path, eagle_output_dir, cp_size, - mix_hidden_states): + mix_hidden_states, + num_gpus): """Test Eagle3 training with a tiny llama model, using different cp_size values.""" - available_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0 - if cp_size == 2 and available_gpus < 2: - pytest.skip("cp_size=2 requires at least 2 GPUs, but only {} found.".format(available_gpus)) + if cp_size == 2 and num_gpus < 2: + pytest.skip("cp_size=2 requires at least 2 GPUs, but only {} found.".format(num_gpus)) if cp_size == 2 and not Version(torch.__version__) >= Version("2.10.0"): pytest.skip("cp_size=2 requires torch 2.10.0") # Create an ultra-tiny EAGLE config for testing to reduce memory usage @@ -210,8 +211,14 @@ def test_convert_to_vllm_ckpt(tiny_llama_path, eagle_output_dir): [ (None, False), # tiny_llama (from fixture), no FakeBase ("moonshotai/Kimi-K2.5", True), # remote HF repo, FakeBaseModel - ("moonshotai/Kimi-K2-Thinking", True), # remote HF repo, no FakeBaseModel - ("MiniMaxAI/MiniMax-M2.5", True), + pytest.param( + "moonshotai/Kimi-K2-Thinking", True, # remote HF repo, no FakeBaseModel + 
marks=pytest.mark.manual(reason="skip redundant test, too slow"), + ), + pytest.param( + "MiniMaxAI/MiniMax-M2.5", True, + marks=pytest.mark.manual(reason="skip redundant test, too slow"), + ), + ], ids=["tinyllama", "kimi-k2.5","kimi-k2-thinking","minimax-m2.5"], ) @@ -220,16 +227,12 @@ def test_offline_eagle3_training( model_source, use_fake_base, ): """Test Eagle3 training with pre-computed hidden states (offline mode / FakeBaseModel).""" - import transformers - model_path = tiny_llama_path if model_source is None else model_source model_id = "tinyllama" if model_source is None else model_source.split("/")[-1] output_subdir = eagle_output_dir / f"eagle-{model_id}-offline" - cfg = transformers.AutoConfig.from_pretrained(model_path, trust_remote_code=True) - - if model_source=="moonshotai/Kimi-K2.5": - #vlm, get text config + cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + if hasattr(cfg, "text_config"): # vlm: get text_config cfg = cfg.text_config offline_data_dir = generate_offline_pt_data( @@ -277,10 +280,8 @@ def test_offline_resume_training_kimi(tiny_daring_anteater_path, tmp_path, eagle Depends on test_offline_eagle3_training["kimi-k2.5"] having run first. Exercises AutoModelForCausalLM.from_pretrained with model_type='fake_base_model'. 
""" - import transformers - checkpoint_dir = eagle_output_dir / "eagle-Kimi-K2.5-offline" - config = transformers.AutoConfig.from_pretrained(checkpoint_dir, trust_remote_code=True) + config = AutoConfig.from_pretrained(checkpoint_dir, trust_remote_code=True) offline_data_dir = generate_offline_pt_data( tmp_path / "offline_data_resume", diff --git a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py index 6b971c3251..6a27ece72f 100644 --- a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py +++ b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py @@ -80,6 +80,7 @@ def test_unified_hf_export_and_check_safetensors( pyt_ckpt_path=tiny_model_dir, qformat=qformat, export_path=output_dir, + dataset="cnn_dailymail", ) # Run the command diff --git a/tests/gpu/torch/quantization/test_gptq.py b/tests/gpu/torch/quantization/test_gptq.py index 0c60bcd007..d43177cae2 100644 --- a/tests/gpu/torch/quantization/test_gptq.py +++ b/tests/gpu/torch/quantization/test_gptq.py @@ -163,9 +163,7 @@ def test_gptq_e2e_flow(quant_cfg): model = AutoModelForCausalLM.from_pretrained( "TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="auto" ) - tokenizer = AutoTokenizer.from_pretrained( - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", trust_remote_code=True - ) + tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") # can't set attribute 'pad_token' for "" # We skip this step for Nemo models diff --git a/tests/unit/onnx/quantization/test_quantize_api.py b/tests/unit/onnx/quantization/test_quantize_api.py index 3ce8f2f7fe..464fb1a88b 100644 --- a/tests/unit/onnx/quantization/test_quantize_api.py +++ b/tests/unit/onnx/quantization/test_quantize_api.py @@ -36,7 +36,6 @@ # onnxruntime version that supports opset 22+ ORT_VERSION_FOR_OPSET_22 = version.parse("1.23.0") -TORCH_VERSION_FOR_OPSET_22 = version.parse("2.8.0") # Test scenarios: (scenario_name, 
export_opset_offset, request_opset_offset, expected_opset_offset) @@ -87,11 +86,6 @@ def test_quantize_opset_handling( pytest.skip( f"Opset {max_opset} requires onnxruntime >= {ORT_VERSION_FOR_OPSET_22}, have {ort_version}" ) - torch_version = version.parse(torch.__version__) - if torch_version < TORCH_VERSION_FOR_OPSET_22: - pytest.skip( - f"Opset {max_opset} requires torch >= {TORCH_VERSION_FOR_OPSET_22}, have {torch_version}" - ) # Setup: create and export model model_torch = SimpleMLP() diff --git a/tests/unit/torch/opt/plugins/test_transformers_save_load.py b/tests/unit/torch/opt/plugins/test_transformers_save_load.py index 25b182b9bd..fced5734e4 100644 --- a/tests/unit/torch/opt/plugins/test_transformers_save_load.py +++ b/tests/unit/torch/opt/plugins/test_transformers_save_load.py @@ -17,6 +17,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed import pytest +import torch from _test_utils.torch.opt.utils import apply_mode_with_sampling from _test_utils.torch.transformers_models import ( create_tiny_llama_dir, @@ -27,7 +28,7 @@ @pytest.mark.parametrize("model_cls", [LlamaForCausalLM, AutoModelForCausalLM]) def test_causal_lm_save_restore(tmp_path, model_cls): - tiny_llama_dir = create_tiny_llama_dir(tmp_path, hidden_size=128) + tiny_llama_dir = create_tiny_llama_dir(tmp_path, hidden_size=128, dtype=torch.float32) model_ref = model_cls.from_pretrained(tiny_llama_dir) # TODO: Add calibrate, compress mode to the test model_ref = apply_mode_with_sampling( @@ -41,7 +42,7 @@ def test_causal_lm_save_restore(tmp_path, model_cls): def test_causal_lm_from_config(tmp_path): """Test loading a model using from_config after applying optimizations""" - tiny_llama_dir = create_tiny_llama_dir(tmp_path, hidden_size=128) + tiny_llama_dir = create_tiny_llama_dir(tmp_path, hidden_size=128, dtype=torch.float32) model_ref = AutoModelForCausalLM.from_pretrained(tiny_llama_dir) model_ref = apply_mode_with_sampling( diff --git 
a/tests/unit/torch/quantization/plugins/test_huggingface.py b/tests/unit/torch/quantization/plugins/test_huggingface.py index 33730409a6..8c66e6651c 100644 --- a/tests/unit/torch/quantization/plugins/test_huggingface.py +++ b/tests/unit/torch/quantization/plugins/test_huggingface.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import os import warnings from contextlib import nullcontext @@ -28,6 +27,7 @@ get_tiny_qwen3_moe, tf_modelopt_state_and_output_tester, ) +from packaging.version import Version import modelopt.torch.quantization as mtq from modelopt.torch.quantization.nn import QuantLinear, QuantModuleRegistry @@ -105,12 +105,17 @@ def test_convert_conv1d(): assert torch.allclose(out_1, out_2) +@pytest.mark.skipif( + Version(transformers.__version__) < Version("5.0"), + reason="test_dbrx is not supported for transformers<5.0", +) def test_dbrx(): assert DbrxExperts in QuantModuleRegistry assert DbrxExpertGLU in QuantModuleRegistry config = DbrxConfig( - ffn_config=DbrxFFNConfig(ffn_hidden_size=8, moe_num_experts=2), hidden_size=32 + ffn_config=DbrxFFNConfig(ffn_hidden_size=8, moe_num_experts=2, hidden_size=32), + hidden_size=32, ) model_ref = DbrxFFN(config) @@ -131,14 +136,17 @@ def test_dbrx(): assert hasattr(expertglu_test, "v1_linear") and not hasattr(expertglu_test, "v1") assert hasattr(expertglu_test, "w2_linear") and not hasattr(expertglu_test, "w2") + # Weights are stored transposed (W = w1[i].T) to match F.linear semantics with + # transformers 5.0's raw matmul: x @ w1[i] = F.linear(x, w1[i].T) assert torch.allclose( - torch.concat(list(expertglu_test.w1_linear.parameters()), dim=0), + torch.concat([m.weight.T for m in expertglu_test.w1_linear], dim=0), expertglu_ref.w1, ) mtq.set_quantizer_attribute(model_test, "*", {"enable": False}) - x = torch.randn(1, 4, 32) + # In transformers 5.0, the FFN 
input dimension is ffn_hidden_size (not hidden_size) + x = torch.randn(1, 4, 8) out_1 = model_ref(x) out_2 = model_test(x) assert torch.allclose(out_1[0], out_2[0]) @@ -147,6 +155,9 @@ def test_dbrx(): @pytest.mark.parametrize("method", ["gradient", "kl_div"]) @pytest.mark.parametrize("model_provider", [get_tiny_llama, get_tiny_qwen3_moe]) def test_autoquantize_huggingface(model_provider, method): + if model_provider == get_tiny_qwen3_moe and Version(torch.__version__) < Version("2.9"): + pytest.skip("torch 2.8 grouped_mm is CUDA-only") + model = model_provider() input_ids = model.dummy_inputs["input_ids"] @@ -190,7 +201,7 @@ def forward_step(model, batch): ], ) def test_quantized_transformers_save_restore(tmp_path, model_cls, quant_config): - tiny_llama_dir = create_tiny_llama_dir(tmp_path) + tiny_llama_dir = create_tiny_llama_dir(tmp_path, dtype=torch.float32) # update config to fit test cases if quant_config == mtq.INT4_AWQ_CFG: quant_config["quant_cfg"]["*weight_quantizer"]["block_sizes"] = {-1: 16} diff --git a/tests/unit/torch/quantization/plugins/test_peft.py b/tests/unit/torch/quantization/plugins/test_peft.py index c794c67bc2..310a744c70 100644 --- a/tests/unit/torch/quantization/plugins/test_peft.py +++ b/tests/unit/torch/quantization/plugins/test_peft.py @@ -16,6 +16,7 @@ import pytest import torch from _test_utils.torch.transformers_models import get_tiny_gpt_oss, get_tiny_llama, tf_output_tester +from packaging.version import Version pytest.importorskip("peft") transformers = pytest.importorskip("transformers") @@ -53,6 +54,9 @@ def test_convert_loralinear(): tf_output_tester(model_ref, model_test) +@pytest.mark.skipif( + Version(torch.__version__) < Version("2.9"), reason="torch 2.8 grouped_mm is CUDA-only" +) def test_peft_flow(tmp_path): model_original = get_tiny_gpt_oss(num_hidden_layers=1) diff --git a/tests/unit/torch/quantization/plugins/test_sparse_moe.py b/tests/unit/torch/quantization/plugins/test_sparse_moe.py index 4ef428e9bb..3e8baab798 
100644 --- a/tests/unit/torch/quantization/plugins/test_sparse_moe.py +++ b/tests/unit/torch/quantization/plugins/test_sparse_moe.py @@ -20,9 +20,13 @@ import pytest import torch import torch.nn as nn +from packaging.version import Version pytest.importorskip("transformers") +if Version(torch.__version__) < Version("2.9"): + pytest.skip("torch 2.8 grouped_mm is CUDA-only", allow_module_level=True) + from _test_utils.torch.transformers_models import get_tiny_qwen3_moe import modelopt.torch.quantization as mtq diff --git a/tests/unit/torch/quantization/test_calibrator.py b/tests/unit/torch/quantization/test_calibrator.py index 4cb7458912..19c86b0b9f 100644 --- a/tests/unit/torch/quantization/test_calibrator.py +++ b/tests/unit/torch/quantization/test_calibrator.py @@ -88,8 +88,8 @@ def test_track_amax_raises(self): max_calibrator.collect(x_3) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestHistogramCalibrator: + @pytest.mark.skip(reason="TODO: Fix assertions in test_grow") def test_grow(self, verbose): x_1 = torch.tensor([0, 255, 255, 255, 255, 255]) x_2 = torch.tensor([0, 255, 255, 255, 255, 256]) @@ -181,7 +181,6 @@ def test_torch_hist(self): ) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestEntropyCalibrator: def test_one_tensor(self, verbose): hist_calibrator = calib.HistogramCalibrator( @@ -244,7 +243,6 @@ def test_repr(self): repr(hist_calibrator) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestMSECalibrator: def test_one_tensor(self, verbose): calibrator = calib.HistogramCalibrator(8, None, False, num_bins=32) @@ -299,7 +297,6 @@ def test_repr(self): repr(calibrator) -@pytest.mark.manual(reason="slow test, run with --run-manual") class TestPercentileCalibrator: def test_one_tensor(self, verbose): calibrator = calib.HistogramCalibrator(8, None, False) @@ -359,7 +356,6 @@ def test_range(self): calibrator.compute_amax("percentile", percentile=200) -@pytest.mark.manual(reason="slow test, 
run with --run-manual") class TestCalibrateWeights: def test_max(self): ref_lenet = QuantConvLinear() diff --git a/tox.ini b/tox.ini index 80299d814d..8b7022d074 100644 --- a/tox.ini +++ b/tox.ini @@ -12,14 +12,13 @@ passenv = ############################ # CPU Unit test environments ############################ -[testenv:{py310,py311,py312,py313}-torch{26,27,28,29,210}-tf_{min,latest}-unit] +[testenv:{py310,py311,py312,py313}-torch{28,29,210,211}-tf_{min,latest}-unit] deps = # torch version auto-selected based on torchvision version - torch26: torchvision~=0.21.0 - torch27: torchvision~=0.22.0 torch28: torchvision~=0.23.0 torch29: torchvision~=0.24.0 torch210: torchvision~=0.25.0 + torch211: torchvision~=0.26.0 -e .[all,dev-test] @@ -37,7 +36,7 @@ allowlist_externals = bash, rm deps = # Make sure torch 2.10 is used - torchvision~=0.25.0 + torchvision~=0.26.0 # ONNX unit tests heavily rely on torch / torchvision onnx: .[onnx,dev-test]