From 66c7934e124a3d692fbf9c6acda8adc9c74e599c Mon Sep 17 00:00:00 2001 From: Alexander Dokuchaev Date: Fri, 1 May 2026 08:50:40 +0300 Subject: [PATCH 1/8] transformers==5.0.0rc3 --- examples/llm_compression/onnx/tiny_llama/requirements.txt | 2 +- .../onnx/tiny_llama_scale_estimation/requirements.txt | 2 +- .../openvino/smollm2_360m_adaptive_codebook/requirements.txt | 2 +- .../openvino/smollm2_360m_codebook/requirements.txt | 2 +- .../llm_compression/openvino/smollm2_360m_fp8/requirements.txt | 2 +- examples/llm_compression/openvino/tiny_llama/requirements.txt | 2 +- .../openvino/tiny_llama_find_hyperparams/requirements.txt | 2 +- .../openvino/tiny_llama_synthetic_data/requirements.txt | 2 +- .../torch/distillation_qat_with_lora/requirements.txt | 2 +- .../torch/downstream_qat_with_nls/requirements.txt | 2 +- examples/llm_compression/torch_fx/tiny_llama/requirements.txt | 2 +- tests/openvino/requirements.txt | 2 +- tests/post_training/requirements.txt | 2 +- tests/torch/requirements.txt | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/llm_compression/onnx/tiny_llama/requirements.txt b/examples/llm_compression/onnx/tiny_llama/requirements.txt index ce7f1bb48f3..8a638996e0b 100644 --- a/examples/llm_compression/onnx/tiny_llama/requirements.txt +++ b/examples/llm_compression/onnx/tiny_llama/requirements.txt @@ -1,4 +1,4 @@ -transformers==4.53.0 +transformers==5.0.0rc3 openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 diff --git a/examples/llm_compression/onnx/tiny_llama_scale_estimation/requirements.txt b/examples/llm_compression/onnx/tiny_llama_scale_estimation/requirements.txt index 7cb1d588fe2..afba286677b 100644 --- a/examples/llm_compression/onnx/tiny_llama_scale_estimation/requirements.txt +++ b/examples/llm_compression/onnx/tiny_llama_scale_estimation/requirements.txt @@ -1,5 +1,5 @@ torch==2.10.0 -transformers==4.53.0 +transformers==5.0.0rc3 openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 diff 
--git a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/requirements.txt b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/requirements.txt index 562ccc21cc6..b7687ba3021 100644 --- a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/requirements.txt +++ b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/requirements.txt @@ -2,7 +2,7 @@ datasets==5.0.0 openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 -transformers==4.53.0 +transformers==5.0.0rc3 onnx==1.21.0 torch==2.10.0 torchvision==0.25.0 diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt b/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt index 8d98d5d55ab..cefabe36a41 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt @@ -1,7 +1,7 @@ openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 -transformers==4.53.0 +transformers==5.0.0rc3 onnx==1.21.0 torch==2.10.0 torchvision==0.25.0 diff --git a/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt b/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt index d91c5d1957e..6de61fb412e 100644 --- a/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt +++ b/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt @@ -2,6 +2,6 @@ datasets==5.0.0 openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 -transformers==4.53.0 +transformers==5.0.0rc3 onnx==1.21.0 torch==2.10.0 diff --git a/examples/llm_compression/openvino/tiny_llama/requirements.txt b/examples/llm_compression/openvino/tiny_llama/requirements.txt index 86ee222e05f..74478f2302f 100644 --- a/examples/llm_compression/openvino/tiny_llama/requirements.txt +++ b/examples/llm_compression/openvino/tiny_llama/requirements.txt @@ -4,4 +4,4 @@ openvino==2026.2.0 optimum-intel[openvino]==2.0.0 
optimum==2.2.0 torch==2.10.0 -transformers==4.53.0 +transformers==5.0.0rc3 diff --git a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt index 7648960d565..277f2be63f9 100644 --- a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt +++ b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt @@ -3,6 +3,6 @@ numpy==1.26.4 openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 -transformers==4.53.0 +transformers==5.0.0rc3 onnx==1.21.0 torch==2.10.0 diff --git a/examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt b/examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt index 285d8ce4ced..1005d12956f 100644 --- a/examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt +++ b/examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt @@ -4,5 +4,5 @@ numpy>=1.23.5,<2 openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 -transformers==4.53.0 +transformers==5.0.0rc3 onnx==1.21.0 diff --git a/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt b/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt index 1fb91a4647a..fbc8178f187 100644 --- a/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt +++ b/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt @@ -5,5 +5,5 @@ numpy>=1.23.5,<2 openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 -transformers==4.53.0 +transformers==5.0.0rc3 lm_eval==0.4.8 diff --git a/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt b/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt index fc3a0ac02f6..d77d9788efa 100644 --- a/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt +++ 
b/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt @@ -4,6 +4,6 @@ numpy>=1.23.5,<2 openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 -transformers==4.53.0 +transformers==5.0.0rc3 lm_eval==0.4.8 torchao==0.17.0 diff --git a/examples/llm_compression/torch_fx/tiny_llama/requirements.txt b/examples/llm_compression/torch_fx/tiny_llama/requirements.txt index 80ce17a9ebe..d196e50a8a5 100644 --- a/examples/llm_compression/torch_fx/tiny_llama/requirements.txt +++ b/examples/llm_compression/torch_fx/tiny_llama/requirements.txt @@ -1,4 +1,4 @@ -transformers==4.53.0 +transformers==5.0.0rc3 datasets==5.0.0 openvino==2026.2.0 optimum==2.2.0 diff --git a/tests/openvino/requirements.txt b/tests/openvino/requirements.txt index 47e3781a108..e76d7f4f39f 100644 --- a/tests/openvino/requirements.txt +++ b/tests/openvino/requirements.txt @@ -15,6 +15,6 @@ addict>=2.4.0 timm==0.9.2 efficientnet_pytorch==0.7.1 datasets -transformers==4.53.0 +transformers==5.0.0rc3 optimum-intel==2.0.0 optimum==2.2.0 diff --git a/tests/post_training/requirements.txt b/tests/post_training/requirements.txt index a409d23282c..5a63643bbb4 100644 --- a/tests/post_training/requirements.txt +++ b/tests/post_training/requirements.txt @@ -25,5 +25,5 @@ tensorboard==2.20.0 tensorflow-io==0.37.0 timm==0.9.2 accelerate==1.9.0 -transformers==4.53.0 +transformers==5.0.0rc3 whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@light_req#subdirectory=tools/who_what_benchmark diff --git a/tests/torch/requirements.txt b/tests/torch/requirements.txt index f3057e556e9..11ec2b7cb48 100644 --- a/tests/torch/requirements.txt +++ b/tests/torch/requirements.txt @@ -14,7 +14,7 @@ torchvision addict>=2.4.0 efficientnet_pytorch==0.7.1 -transformers==4.53.0 +transformers==5.0.0rc3 sentence-transformers==4.1.0 optimum-intel==2.0.0 From 5d3d5d854d38dfe2c572fcd133f5103a1654dd15 Mon Sep 17 00:00:00 2001 From: Alexander Dokuchaev Date: Tue, 16 Jun 2026 17:15:41 +0300 Subject: [PATCH 2/8] 
sentence-transformers==5.6.0 --- tests/torch/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/torch/requirements.txt b/tests/torch/requirements.txt index 11ec2b7cb48..2d14049c84e 100644 --- a/tests/torch/requirements.txt +++ b/tests/torch/requirements.txt @@ -16,7 +16,7 @@ addict>=2.4.0 efficientnet_pytorch==0.7.1 transformers==5.0.0rc3 -sentence-transformers==4.1.0 +sentence-transformers==5.6.0 optimum-intel==2.0.0 optimum==2.2.0 accelerate==1.9.0 From 36acba96c0341d9976a8819a499377d9076bfb52 Mon Sep 17 00:00:00 2001 From: Alexander Dokuchaev Date: Wed, 17 Jun 2026 10:51:55 +0300 Subject: [PATCH 3/8] f --- .../llm_compression/onnx/tiny_llama/main.py | 6 ++++++ .../onnx/tiny_llama_scale_estimation/main.py | 7 +++++++ .../smollm2_360m_adaptive_codebook/main.py | 6 ++++++ .../openvino/smollm2_360m_codebook/main.py | 6 ++++++ .../openvino/tiny_llama/main.py | 6 ++++++ .../tiny_llama_find_hyperparams/main.py | 6 ++++++ .../tiny_llama_synthetic_data/main.py | 6 ++++++ .../requirements.txt | 2 +- .../downstream_qat_with_nls/requirements.txt | 2 +- ..._weights_compression_statistics_caching.py | 6 ++++++ tests/post_training/requirements.txt | 2 +- .../sparsify_activations/helpers.py | 16 ---------------- .../sparsify_activations/test_algo.py | 19 ------------------- 13 files changed, 52 insertions(+), 38 deletions(-) diff --git a/examples/llm_compression/onnx/tiny_llama/main.py b/examples/llm_compression/onnx/tiny_llama/main.py index 90f1b475ac6..e90fe651f28 100644 --- a/examples/llm_compression/onnx/tiny_llama/main.py +++ b/examples/llm_compression/onnx/tiny_llama/main.py @@ -14,6 +14,10 @@ from pathlib import Path import onnx + +# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel +# WA for https://github.com/huggingface/optimum-intel/pull/1798 +import optimum.intel.utils.import_utils as import_utils from optimum.intel.openvino import OVModelForCausalLM from optimum.onnxruntime 
import ORTModelForCausalLM from transformers import AutoTokenizer @@ -21,6 +25,8 @@ import nncf from nncf.onnx.quantization.backend_parameters import BackendParameters +import_utils._transformers_version = "5.0.0" + ROOT = Path(__file__).parent.resolve() diff --git a/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py b/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py index 818c8610248..6e51b0dcb06 100644 --- a/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py +++ b/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py @@ -16,6 +16,10 @@ import numpy as np import onnx + +# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel +# WA for https://github.com/huggingface/optimum-intel/pull/1798 +import optimum.intel.utils.import_utils as import_utils import torch from datasets import load_dataset from optimum.intel.openvino import OVModelForCausalLM @@ -27,6 +31,9 @@ import nncf from nncf.onnx.quantization.backend_parameters import BackendParameters +import_utils._transformers_version = "5.0.0" + + ROOT = Path(__file__).parent.resolve() MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" OUTPUT_DIR = ROOT / "tinyllama_compressed" diff --git a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py index 8975609e00d..780828c2680 100644 --- a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py @@ -14,6 +14,10 @@ import datasets import numpy as np + +# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel +# WA for https://github.com/huggingface/optimum-intel/pull/1798 +import optimum.intel.utils.import_utils as import_utils from optimum.intel.openvino import OVModelForCausalLM from scipy.stats import norm from torch.jit import TracerWarning 
@@ -23,6 +27,8 @@ import nncf from nncf.quantization.advanced_parameters import AdvancedAdaptiveCodebookParameters +import_utils._transformers_version = "5.0.0" + logging.set_verbosity_error() warnings.filterwarnings("ignore", category=TracerWarning) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 9711304e9dc..849dc22d9e2 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -12,6 +12,10 @@ import warnings import numpy as np + +# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel +# WA for https://github.com/huggingface/optimum-intel/pull/1798 +import optimum.intel.utils.import_utils as import_utils from optimum.intel.openvino import OVModelForCausalLM from torch.jit import TracerWarning from transformers import AutoTokenizer @@ -19,6 +23,8 @@ import nncf +import_utils._transformers_version = "5.0.0" + logging.set_verbosity_error() warnings.filterwarnings("ignore", category=TracerWarning) diff --git a/examples/llm_compression/openvino/tiny_llama/main.py b/examples/llm_compression/openvino/tiny_llama/main.py index 14224dba5d4..be4736ca242 100644 --- a/examples/llm_compression/openvino/tiny_llama/main.py +++ b/examples/llm_compression/openvino/tiny_llama/main.py @@ -12,12 +12,18 @@ from functools import partial import numpy as np + +# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel +# WA for https://github.com/huggingface/optimum-intel/pull/1798 +import optimum.intel.utils.import_utils as import_utils from datasets import load_dataset from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer import nncf +import_utils._transformers_version = "5.0.0" + def main(): MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" diff --git 
a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py index 7705073bbe9..ff55b1b1530 100644 --- a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py +++ b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py @@ -17,6 +17,10 @@ import numpy as np import openvino as ov + +# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel +# WA for https://github.com/huggingface/optimum-intel/pull/1798 +import optimum.intel.utils.import_utils as import_utils from datasets import load_dataset from optimum.intel import OVModelForCausalLM from transformers import AutoTokenizer @@ -26,6 +30,8 @@ from nncf.common.logging import nncf_logger from nncf.quantization.advanced_parameters import AdvancedCompressionParameters +import_utils._transformers_version = "5.0.0" + DataItem = TypeVar("DataItem") ModelInput = TypeVar("ModelInput") diff --git a/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py b/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py index 25faa00ee28..3f22faafae4 100644 --- a/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py +++ b/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py @@ -12,12 +12,18 @@ from functools import partial import numpy as np + +# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel +# WA for https://github.com/huggingface/optimum-intel/pull/1798 +import optimum.intel.utils.import_utils as import_utils import torch from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer import nncf +import_utils._transformers_version = "5.0.0" + SEED = 0 diff --git a/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt b/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt index fbc8178f187..2d38b27eb99 
100644 --- a/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt +++ b/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt @@ -6,4 +6,4 @@ openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 transformers==5.0.0rc3 -lm_eval==0.4.8 +lm_eval==0.4.12 diff --git a/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt b/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt index d77d9788efa..f79199198dd 100644 --- a/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt +++ b/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt @@ -5,5 +5,5 @@ openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 transformers==5.0.0rc3 -lm_eval==0.4.8 +lm_eval==0.4.12 torchao==0.17.0 diff --git a/tests/openvino/native/quantization/test_weights_compression_statistics_caching.py b/tests/openvino/native/quantization/test_weights_compression_statistics_caching.py index 45264e71c1d..f8043d906db 100644 --- a/tests/openvino/native/quantization/test_weights_compression_statistics_caching.py +++ b/tests/openvino/native/quantization/test_weights_compression_statistics_caching.py @@ -15,6 +15,10 @@ import datasets import numpy as np import openvino as ov + +# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel +# WA for https://github.com/huggingface/optimum-intel/pull/1798 +import optimum.intel.utils.import_utils as import_utils from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer @@ -22,6 +26,8 @@ from nncf.quantization.advanced_parameters import AdvancedCompressionParameters from nncf.scopes import IgnoredScope +import_utils._transformers_version = "5.0.0" + MODEL_ID = "hf-internal-testing/tiny-random-OPTForCausalLM" DEFAULT_RATIO = 0.4 DEFAULT_GROUP_SIZE = 4 diff --git a/tests/post_training/requirements.txt b/tests/post_training/requirements.txt index 5a63643bbb4..9d0ad5426a6 100644 --- 
a/tests/post_training/requirements.txt +++ b/tests/post_training/requirements.txt @@ -22,7 +22,7 @@ optimum-onnx @ git+https://github.com/AlexanderDokuchaev/optimum-onnx.git@b57739 scikit-learn>=1.2.2,<=1.5.0 soundfile==0.12.1 tensorboard==2.20.0 -tensorflow-io==0.37.0 +tensorflow-io==0.37.1 timm==0.9.2 accelerate==1.9.0 transformers==5.0.0rc3 diff --git a/tests/torch/function_hook/sparsify_activations/helpers.py b/tests/torch/function_hook/sparsify_activations/helpers.py index bb59019e5f9..3c6c20a73ce 100644 --- a/tests/torch/function_hook/sparsify_activations/helpers.py +++ b/tests/torch/function_hook/sparsify_activations/helpers.py @@ -15,7 +15,6 @@ import openvino as ov import torch import torch.nn as nn -import transformers.models from nncf import IgnoredScope from nncf.experimental.torch.sparsify_activations import TargetScope @@ -36,21 +35,6 @@ def forward(self, input_ids: torch.Tensor): return y0, y1 -def dummy_llama_model(): - config = transformers.models.llama.configuration_llama.LlamaConfig( - vocab_size=32, - hidden_size=8, - intermediate_size=14, - num_attention_heads=2, - num_key_value_heads=1, - num_hidden_layers=2, - use_cache=False, - return_dict=False, - ) - model = transformers.AutoModelForCausalLM.from_config(config, attn_implementation="eager") - return model - - def count_sparsifier_patterns_in_ov(model: ov.Model) -> int: """ Counts the number of activation sparsification pattern "Abs -> LessEqual -> Select" diff --git a/tests/torch/function_hook/sparsify_activations/test_algo.py b/tests/torch/function_hook/sparsify_activations/test_algo.py index 66092acbbbb..8ca14b6baf7 100644 --- a/tests/torch/function_hook/sparsify_activations/test_algo.py +++ b/tests/torch/function_hook/sparsify_activations/test_algo.py @@ -32,7 +32,6 @@ from tests.cross_fw.shared.paths import TEST_ROOT from tests.torch.function_hook.sparsify_activations.helpers import ThreeLinearModel from tests.torch.function_hook.sparsify_activations.helpers import 
count_sparsifier_patterns_in_ov -from tests.torch.function_hook.sparsify_activations.helpers import dummy_llama_model from tests.torch.helpers import set_torch_seed from tests.torch.utils import compare_with_reference_file from tests.torch.utils import to_comparable_nx_graph @@ -100,24 +99,6 @@ def __str__(self) -> str: ref_num_batches_tracked=3, ref_num_patterns_in_ov=2, ), - SparsifyActivationsAlgorithmTestDesc( - name="dummy_llama", - model_getter=dummy_llama_model, - dataset_getter=lambda device: nncf.Dataset(torch.randint(0, 30, (3, 2, 8)).to(device)), - target_sparsity_by_scope={ - TargetScope(patterns=[".*gate_proj.*"]): 0.2, - TargetScope(patterns=[".*up_proj.*"]): 0.3, - TargetScope(patterns=[".*down_proj.*"]): 0.4, - }, - ignored_scope=None, - ref_sparsifier_target_sparsity={ - (f"pre_hooks.model/mlp/{name}/linear/{layer_id}__0.0"): sparsity - for name, sparsity in [("gate_proj", 0.2), ("up_proj", 0.3), ("down_proj", 0.4)] - for layer_id in [0, 1] - }, - ref_num_batches_tracked=3, - ref_num_patterns_in_ov=6, - ), ] From fbfada8f88dddae6b62d4a81bdc8a4c6288468f3 Mon Sep 17 00:00:00 2001 From: Alexander Dokuchaev Date: Wed, 17 Jun 2026 11:27:07 +0300 Subject: [PATCH 4/8] transformers==5.0.0 --- examples/llm_compression/onnx/tiny_llama/requirements.txt | 2 +- .../onnx/tiny_llama_scale_estimation/requirements.txt | 2 +- .../openvino/smollm2_360m_adaptive_codebook/requirements.txt | 2 +- .../openvino/smollm2_360m_codebook/requirements.txt | 2 +- .../llm_compression/openvino/smollm2_360m_fp8/requirements.txt | 2 +- examples/llm_compression/openvino/tiny_llama/requirements.txt | 2 +- .../openvino/tiny_llama_find_hyperparams/requirements.txt | 2 +- .../openvino/tiny_llama_synthetic_data/requirements.txt | 2 +- .../torch/distillation_qat_with_lora/requirements.txt | 2 +- .../torch/downstream_qat_with_nls/requirements.txt | 2 +- examples/llm_compression/torch_fx/tiny_llama/requirements.txt | 2 +- tests/openvino/requirements.txt | 2 +- 
tests/post_training/requirements.txt | 2 +- tests/torch/requirements.txt | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/llm_compression/onnx/tiny_llama/requirements.txt b/examples/llm_compression/onnx/tiny_llama/requirements.txt index 8a638996e0b..9b46ee4bb94 100644 --- a/examples/llm_compression/onnx/tiny_llama/requirements.txt +++ b/examples/llm_compression/onnx/tiny_llama/requirements.txt @@ -1,4 +1,4 @@ -transformers==5.0.0rc3 +transformers==5.0.0 openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 diff --git a/examples/llm_compression/onnx/tiny_llama_scale_estimation/requirements.txt b/examples/llm_compression/onnx/tiny_llama_scale_estimation/requirements.txt index afba286677b..1dd5c782892 100644 --- a/examples/llm_compression/onnx/tiny_llama_scale_estimation/requirements.txt +++ b/examples/llm_compression/onnx/tiny_llama_scale_estimation/requirements.txt @@ -1,5 +1,5 @@ torch==2.10.0 -transformers==5.0.0rc3 +transformers==5.0.0 openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 diff --git a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/requirements.txt b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/requirements.txt index b7687ba3021..52766d73c6e 100644 --- a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/requirements.txt +++ b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/requirements.txt @@ -2,7 +2,7 @@ datasets==5.0.0 openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 -transformers==5.0.0rc3 +transformers==5.0.0 onnx==1.21.0 torch==2.10.0 torchvision==0.25.0 diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt b/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt index cefabe36a41..6efdf0c0a1a 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt 
@@ -1,7 +1,7 @@ openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 -transformers==5.0.0rc3 +transformers==5.0.0 onnx==1.21.0 torch==2.10.0 torchvision==0.25.0 diff --git a/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt b/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt index 6de61fb412e..69ef9c00941 100644 --- a/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt +++ b/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt @@ -2,6 +2,6 @@ datasets==5.0.0 openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 -transformers==5.0.0rc3 +transformers==5.0.0 onnx==1.21.0 torch==2.10.0 diff --git a/examples/llm_compression/openvino/tiny_llama/requirements.txt b/examples/llm_compression/openvino/tiny_llama/requirements.txt index 74478f2302f..ef6f0f5e08a 100644 --- a/examples/llm_compression/openvino/tiny_llama/requirements.txt +++ b/examples/llm_compression/openvino/tiny_llama/requirements.txt @@ -4,4 +4,4 @@ openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 torch==2.10.0 -transformers==5.0.0rc3 +transformers==5.0.0 diff --git a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt index 277f2be63f9..38aa0a66466 100644 --- a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt +++ b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt @@ -3,6 +3,6 @@ numpy==1.26.4 openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 -transformers==5.0.0rc3 +transformers==5.0.0 onnx==1.21.0 torch==2.10.0 diff --git a/examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt b/examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt index 1005d12956f..b1127e728f2 100644 --- a/examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt +++ 
b/examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt @@ -4,5 +4,5 @@ numpy>=1.23.5,<2 openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 -transformers==5.0.0rc3 +transformers==5.0.0 onnx==1.21.0 diff --git a/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt b/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt index 2d38b27eb99..e36b747f735 100644 --- a/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt +++ b/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt @@ -5,5 +5,5 @@ numpy>=1.23.5,<2 openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 -transformers==5.0.0rc3 +transformers==5.0.0 lm_eval==0.4.12 diff --git a/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt b/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt index f79199198dd..d19621b7c10 100644 --- a/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt +++ b/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt @@ -4,6 +4,6 @@ numpy>=1.23.5,<2 openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 -transformers==5.0.0rc3 +transformers==5.0.0 lm_eval==0.4.12 torchao==0.17.0 diff --git a/examples/llm_compression/torch_fx/tiny_llama/requirements.txt b/examples/llm_compression/torch_fx/tiny_llama/requirements.txt index d196e50a8a5..a68ebd90210 100644 --- a/examples/llm_compression/torch_fx/tiny_llama/requirements.txt +++ b/examples/llm_compression/torch_fx/tiny_llama/requirements.txt @@ -1,4 +1,4 @@ -transformers==5.0.0rc3 +transformers==5.0.0 datasets==5.0.0 openvino==2026.2.0 optimum==2.2.0 diff --git a/tests/openvino/requirements.txt b/tests/openvino/requirements.txt index e76d7f4f39f..c5dc1472900 100644 --- a/tests/openvino/requirements.txt +++ b/tests/openvino/requirements.txt @@ -15,6 +15,6 @@ addict>=2.4.0 timm==0.9.2 efficientnet_pytorch==0.7.1 datasets -transformers==5.0.0rc3 +transformers==5.0.0 
optimum-intel==2.0.0 optimum==2.2.0 diff --git a/tests/post_training/requirements.txt b/tests/post_training/requirements.txt index 9d0ad5426a6..f00adbd049a 100644 --- a/tests/post_training/requirements.txt +++ b/tests/post_training/requirements.txt @@ -25,5 +25,5 @@ tensorboard==2.20.0 tensorflow-io==0.37.1 timm==0.9.2 accelerate==1.9.0 -transformers==5.0.0rc3 +transformers==5.0.0 whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@light_req#subdirectory=tools/who_what_benchmark diff --git a/tests/torch/requirements.txt b/tests/torch/requirements.txt index 2d14049c84e..73c22107643 100644 --- a/tests/torch/requirements.txt +++ b/tests/torch/requirements.txt @@ -14,7 +14,7 @@ torchvision addict>=2.4.0 efficientnet_pytorch==0.7.1 -transformers==5.0.0rc3 +transformers==5.0.0 sentence-transformers==5.6.0 optimum-intel==2.0.0 From bc5e2dc8bfbdec7bfdd0836a4be39829bf295228 Mon Sep 17 00:00:00 2001 From: Alexander Dokuchaev Date: Wed, 17 Jun 2026 11:30:57 +0300 Subject: [PATCH 5/8] f --- examples/llm_compression/onnx/tiny_llama/main.py | 6 ------ .../onnx/tiny_llama_scale_estimation/main.py | 7 ------- .../openvino/smollm2_360m_adaptive_codebook/main.py | 6 ------ .../llm_compression/openvino/smollm2_360m_codebook/main.py | 6 ------ examples/llm_compression/openvino/tiny_llama/main.py | 6 ------ .../openvino/tiny_llama_find_hyperparams/main.py | 6 ------ .../openvino/tiny_llama_synthetic_data/main.py | 6 ------ .../test_weights_compression_statistics_caching.py | 6 ------ 8 files changed, 49 deletions(-) diff --git a/examples/llm_compression/onnx/tiny_llama/main.py b/examples/llm_compression/onnx/tiny_llama/main.py index e90fe651f28..90f1b475ac6 100644 --- a/examples/llm_compression/onnx/tiny_llama/main.py +++ b/examples/llm_compression/onnx/tiny_llama/main.py @@ -14,10 +14,6 @@ from pathlib import Path import onnx - -# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel -# WA for 
https://github.com/huggingface/optimum-intel/pull/1798 -import optimum.intel.utils.import_utils as import_utils from optimum.intel.openvino import OVModelForCausalLM from optimum.onnxruntime import ORTModelForCausalLM from transformers import AutoTokenizer @@ -25,8 +21,6 @@ import nncf from nncf.onnx.quantization.backend_parameters import BackendParameters -import_utils._transformers_version = "5.0.0" - ROOT = Path(__file__).parent.resolve() diff --git a/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py b/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py index 6e51b0dcb06..818c8610248 100644 --- a/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py +++ b/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py @@ -16,10 +16,6 @@ import numpy as np import onnx - -# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel -# WA for https://github.com/huggingface/optimum-intel/pull/1798 -import optimum.intel.utils.import_utils as import_utils import torch from datasets import load_dataset from optimum.intel.openvino import OVModelForCausalLM @@ -31,9 +27,6 @@ import nncf from nncf.onnx.quantization.backend_parameters import BackendParameters -import_utils._transformers_version = "5.0.0" - - ROOT = Path(__file__).parent.resolve() MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" OUTPUT_DIR = ROOT / "tinyllama_compressed" diff --git a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py index 780828c2680..8975609e00d 100644 --- a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py @@ -14,10 +14,6 @@ import datasets import numpy as np - -# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel -# WA for 
https://github.com/huggingface/optimum-intel/pull/1798 -import optimum.intel.utils.import_utils as import_utils from optimum.intel.openvino import OVModelForCausalLM from scipy.stats import norm from torch.jit import TracerWarning @@ -27,8 +23,6 @@ import nncf from nncf.quantization.advanced_parameters import AdvancedAdaptiveCodebookParameters -import_utils._transformers_version = "5.0.0" - logging.set_verbosity_error() warnings.filterwarnings("ignore", category=TracerWarning) diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 849dc22d9e2..9711304e9dc 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -12,10 +12,6 @@ import warnings import numpy as np - -# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel -# WA for https://github.com/huggingface/optimum-intel/pull/1798 -import optimum.intel.utils.import_utils as import_utils from optimum.intel.openvino import OVModelForCausalLM from torch.jit import TracerWarning from transformers import AutoTokenizer @@ -23,8 +19,6 @@ import nncf -import_utils._transformers_version = "5.0.0" - logging.set_verbosity_error() warnings.filterwarnings("ignore", category=TracerWarning) diff --git a/examples/llm_compression/openvino/tiny_llama/main.py b/examples/llm_compression/openvino/tiny_llama/main.py index be4736ca242..14224dba5d4 100644 --- a/examples/llm_compression/openvino/tiny_llama/main.py +++ b/examples/llm_compression/openvino/tiny_llama/main.py @@ -12,18 +12,12 @@ from functools import partial import numpy as np - -# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel -# WA for https://github.com/huggingface/optimum-intel/pull/1798 -import optimum.intel.utils.import_utils as import_utils from datasets import load_dataset from 
optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer import nncf -import_utils._transformers_version = "5.0.0" - def main(): MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" diff --git a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py index ff55b1b1530..7705073bbe9 100644 --- a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py +++ b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py @@ -17,10 +17,6 @@ import numpy as np import openvino as ov - -# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel -# WA for https://github.com/huggingface/optimum-intel/pull/1798 -import optimum.intel.utils.import_utils as import_utils from datasets import load_dataset from optimum.intel import OVModelForCausalLM from transformers import AutoTokenizer @@ -30,8 +26,6 @@ from nncf.common.logging import nncf_logger from nncf.quantization.advanced_parameters import AdvancedCompressionParameters -import_utils._transformers_version = "5.0.0" - DataItem = TypeVar("DataItem") ModelInput = TypeVar("ModelInput") diff --git a/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py b/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py index 3f22faafae4..25faa00ee28 100644 --- a/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py +++ b/examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py @@ -12,18 +12,12 @@ from functools import partial import numpy as np - -# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel -# WA for https://github.com/huggingface/optimum-intel/pull/1798 -import optimum.intel.utils.import_utils as import_utils import torch from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer import nncf -import_utils._transformers_version 
= "5.0.0" - SEED = 0 diff --git a/tests/openvino/native/quantization/test_weights_compression_statistics_caching.py b/tests/openvino/native/quantization/test_weights_compression_statistics_caching.py index f8043d906db..45264e71c1d 100644 --- a/tests/openvino/native/quantization/test_weights_compression_statistics_caching.py +++ b/tests/openvino/native/quantization/test_weights_compression_statistics_caching.py @@ -15,10 +15,6 @@ import datasets import numpy as np import openvino as ov - -# TODO(AlexanderDokuchaev): Remove this workaround when update transformers version or optimum-intel -# WA for https://github.com/huggingface/optimum-intel/pull/1798 -import optimum.intel.utils.import_utils as import_utils from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer @@ -26,8 +22,6 @@ from nncf.quantization.advanced_parameters import AdvancedCompressionParameters from nncf.scopes import IgnoredScope -import_utils._transformers_version = "5.0.0" - MODEL_ID = "hf-internal-testing/tiny-random-OPTForCausalLM" DEFAULT_RATIO = 0.4 DEFAULT_GROUP_SIZE = 4 From fd363b0d90ed4b93373816eb10f39c9db4085369 Mon Sep 17 00:00:00 2001 From: Alexander Dokuchaev Date: Wed, 17 Jun 2026 11:35:50 +0300 Subject: [PATCH 6/8] fx --- examples/llm_compression/torch_fx/tiny_llama/main.py | 2 +- examples/llm_compression/torch_fx/tiny_llama/modelling.py | 4 ++-- tests/post_training/pipelines/fx_modelling.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/llm_compression/torch_fx/tiny_llama/main.py b/examples/llm_compression/torch_fx/tiny_llama/main.py index 7b9255c200e..60cd9a06dec 100644 --- a/examples/llm_compression/torch_fx/tiny_llama/main.py +++ b/examples/llm_compression/torch_fx/tiny_llama/main.py @@ -71,7 +71,7 @@ def main() -> str: ] input_ids = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" - ) + )["input_ids"] print("Warmup...") output = 
compressed_model_hf.generate(input_ids) diff --git a/examples/llm_compression/torch_fx/tiny_llama/modelling.py b/examples/llm_compression/torch_fx/tiny_llama/modelling.py index 484bba44043..d5cfeeb1e76 100644 --- a/examples/llm_compression/torch_fx/tiny_llama/modelling.py +++ b/examples/llm_compression/torch_fx/tiny_llama/modelling.py @@ -17,7 +17,6 @@ from transformers import GenerationMixin from transformers import PretrainedConfig from transformers import PreTrainedModel -from transformers.cache_utils import StaticCacheConfig from transformers.integrations.executorch import TorchExportableModuleWithStaticCache from transformers.modeling_outputs import CausalLMOutputWithPast from transformers.models.llama.configuration_llama import LlamaConfig @@ -97,8 +96,9 @@ def convert_and_export_with_cache(model: PreTrainedModel) -> tuple[ExportedProgr example_cache_position = torch.arange(0, 8, dtype=torch.long) model_config = None gen_config = None + model.generation_config.use_cache = True model.generation_config.cache_implementation = "static" - model.generation_config.cache_config = StaticCacheConfig(batch_size=1, max_cache_len=512) + model.generation_config.cache_config = {"batch_size": 1, "max_cache_len": 512} model.generation_config.max_new_tokens = 100 gen_config = model.generation_config model_config = model.config diff --git a/tests/post_training/pipelines/fx_modelling.py b/tests/post_training/pipelines/fx_modelling.py index 559cfb7ad76..61b077d6761 100644 --- a/tests/post_training/pipelines/fx_modelling.py +++ b/tests/post_training/pipelines/fx_modelling.py @@ -16,7 +16,6 @@ from transformers import GenerationMixin from transformers import PretrainedConfig from transformers import PreTrainedModel -from transformers.cache_utils import StaticCacheConfig from transformers.integrations.executorch import TorchExportableModuleWithStaticCache from transformers.modeling_outputs import CausalLMOutputWithPast @@ -84,8 +83,9 @@ def convert_and_export_with_cache(model: 
PreTrainedModel): example_input_ids = torch.ones(1, 8, dtype=torch.long) example_cache_position = torch.arange(0, 8, dtype=torch.long) + model.generation_config.use_cache = True model.generation_config.cache_implementation = "static" - model.generation_config.cache_config = StaticCacheConfig(batch_size=1, max_cache_len=512) + model.generation_config.cache_config = {"batch_size": 1, "max_cache_len": 512} model.generation_config.max_new_tokens = 100 gen_config = model.generation_config model_config = model.config From 3febab359c2bda1b12a003eab91e5c6ad01955f9 Mon Sep 17 00:00:00 2001 From: Alexander Dokuchaev Date: Wed, 17 Jun 2026 13:26:00 +0300 Subject: [PATCH 7/8] f --- examples/llm_compression/onnx/tiny_llama/main.py | 5 +++-- .../llm_compression/onnx/tiny_llama_scale_estimation/main.py | 5 +++-- .../openvino/smollm2_360m_adaptive_codebook/main.py | 5 +++-- .../llm_compression/openvino/smollm2_360m_codebook/main.py | 5 +++-- examples/llm_compression/openvino/smollm2_360m_fp8/main.py | 5 +++-- examples/llm_compression/openvino/tiny_llama/main.py | 5 +++-- .../openvino/tiny_llama_find_hyperparams/requirements.txt | 2 +- .../torch/distillation_qat_with_lora/requirements.txt | 2 +- .../torch/downstream_qat_with_nls/requirements.txt | 2 +- examples/llm_compression/torch_fx/tiny_llama/main.py | 5 +++-- tests/post_training/requirements.txt | 2 +- 11 files changed, 25 insertions(+), 18 deletions(-) diff --git a/examples/llm_compression/onnx/tiny_llama/main.py b/examples/llm_compression/onnx/tiny_llama/main.py index 90f1b475ac6..6ee571882da 100644 --- a/examples/llm_compression/onnx/tiny_llama/main.py +++ b/examples/llm_compression/onnx/tiny_llama/main.py @@ -66,9 +66,10 @@ def main(): ov_model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, from_onnx=True) messages = [{"role": "user", "content": "What is PyTorch?"}] - input_ids = tokenizer.apply_chat_template( + batch_feature = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, 
return_tensors="pt" - ).to(device=model.device) + ) + input_ids = batch_feature["input_ids"] start_t = time.time() output = ov_model.generate(input_ids, max_new_tokens=100) diff --git a/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py b/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py index 818c8610248..bfd2d5e4010 100644 --- a/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py +++ b/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py @@ -119,9 +119,10 @@ def main(): # Infer Model. ov_model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, from_onnx=True) messages = [{"role": "user", "content": "What is PyTorch?"}] - input_ids = tokenizer.apply_chat_template( + batch_feature = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" - ).to(device=model.device) + ) + input_ids = batch_feature["input_ids"] start_t = time.time() output = ov_model.generate(input_ids, max_new_tokens=100) diff --git a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py index 8975609e00d..63ab7a94667 100644 --- a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py @@ -151,9 +151,10 @@ def generate_answers( for question in questions: messages.append({"role": "user", "content": question}) - input_ids = tokenizer.apply_chat_template( + batch_feature = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" - ).to(device=model.device) + ) + input_ids = batch_feature["input_ids"] input_len = len(input_ids[0]) output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0] diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 
9711304e9dc..caaf00157c7 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -48,9 +48,10 @@ def generate_answers( for question in questions: messages.append({"role": "user", "content": question}) - input_ids = tokenizer.apply_chat_template( + batch_feature = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" - ).to(device=model.device) + ) + input_ids = batch_feature["input_ids"] input_len = len(input_ids[0]) output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0] diff --git a/examples/llm_compression/openvino/smollm2_360m_fp8/main.py b/examples/llm_compression/openvino/smollm2_360m_fp8/main.py index d3558d9a78b..92c500424b1 100644 --- a/examples/llm_compression/openvino/smollm2_360m_fp8/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_fp8/main.py @@ -47,9 +47,10 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): for question in questions: messages.append({"role": "user", "content": question}) - input_ids = tokenizer.apply_chat_template( + batch_feature = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" - ).to(device=model.device) + ) + input_ids = batch_feature["input_ids"] input_len = len(input_ids[0]) output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0] diff --git a/examples/llm_compression/openvino/tiny_llama/main.py b/examples/llm_compression/openvino/tiny_llama/main.py index 14224dba5d4..45d8c031ce5 100644 --- a/examples/llm_compression/openvino/tiny_llama/main.py +++ b/examples/llm_compression/openvino/tiny_llama/main.py @@ -60,9 +60,10 @@ def transform_fn(data, tokenizer): model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR) messages = [{"role": "user", "content": "What is PyTorch?"}] - input_ids = tokenizer.apply_chat_template( + batch_feature = 
tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" - ).to(device=model.device) + ) + input_ids = batch_feature["input_ids"] start_t = time.time() output = model.generate(input_ids, max_new_tokens=100) diff --git a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt index 38aa0a66466..19b71428934 100644 --- a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt +++ b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt @@ -1,4 +1,4 @@ -whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@light_req#subdirectory=tools/who_what_benchmark +whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@a26c9b20d07209f035f6d74aeae94d6a72a132ab#subdirectory=tools/who_what_benchmark numpy==1.26.4 openvino==2026.2.0 optimum-intel==2.0.0 diff --git a/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt b/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt index e36b747f735..90bf5bf06a2 100644 --- a/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt +++ b/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt @@ -6,4 +6,4 @@ openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 transformers==5.0.0 -lm_eval==0.4.12 +lm_eval[hf]==0.4.12 diff --git a/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt b/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt index d19621b7c10..66d01cea870 100644 --- a/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt +++ b/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt @@ -5,5 +5,5 @@ openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 transformers==5.0.0 -lm_eval==0.4.12 +lm_eval[hf]==0.4.12 torchao==0.17.0 diff --git 
a/examples/llm_compression/torch_fx/tiny_llama/main.py b/examples/llm_compression/torch_fx/tiny_llama/main.py index 60cd9a06dec..2c0a403e3ce 100644 --- a/examples/llm_compression/torch_fx/tiny_llama/main.py +++ b/examples/llm_compression/torch_fx/tiny_llama/main.py @@ -69,9 +69,10 @@ def main() -> str: }, {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, ] - input_ids = tokenizer.apply_chat_template( + batch_feature = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" - )["input_ids"] + ) + input_ids = batch_feature["input_ids"] print("Warmup...") output = compressed_model_hf.generate(input_ids) diff --git a/tests/post_training/requirements.txt b/tests/post_training/requirements.txt index f00adbd049a..abd23217f3b 100644 --- a/tests/post_training/requirements.txt +++ b/tests/post_training/requirements.txt @@ -26,4 +26,4 @@ tensorflow-io==0.37.1 timm==0.9.2 accelerate==1.9.0 transformers==5.0.0 -whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@light_req#subdirectory=tools/who_what_benchmark +whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@a26c9b20d07209f035f6d74aeae94d6a72a132ab#subdirectory=tools/who_what_benchmark From 5e6662c693e8eebdee21a6f95811b181a3cd323f Mon Sep 17 00:00:00 2001 From: Alexander Dokuchaev Date: Thu, 18 Jun 2026 18:07:42 +0300 Subject: [PATCH 8/8] gpt --- tests/post_training/pipelines/gpt.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tests/post_training/pipelines/gpt.py b/tests/post_training/pipelines/gpt.py index 57878eea479..779036f2946 100644 --- a/tests/post_training/pipelines/gpt.py +++ b/tests/post_training/pipelines/gpt.py @@ -24,14 +24,37 @@ from tests.post_training.pipelines.base import PTQTestPipeline +# TODO(AlexanderDokuchaev): Remove this wrapper when the issue with torch.jit.trace and transformers>=5.0 is fixed +class
CausalLMTracingWrapper(torch.nn.Module): + """ + Wraps a Hugging Face causal language model so it can be exported via ``torch.jit.trace`` + (used internally by ``openvino.convert_model``). + + Since ``transformers>=5.0`` a causal LM forward pass returns a ``DynamicCache`` in its output + and builds the attention mask with a ``torch.diff``-based packed-sequence check when no + attention mask is provided. Neither construct is supported by ``torch.jit.trace`` / the + OpenVINO PyTorch frontend. + """ + + def __init__(self, model: torch.nn.Module) -> None: + super().__init__() + self.model = model + self.model.config.use_cache = False + + def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor | None = None) -> torch.Tensor: + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + return self.model(input_ids=input_ids, attention_mask=attention_mask) + + class GPT(PTQTestPipeline): """Pipeline for causal language models from Hugging Face repository""" def prepare_model(self) -> None: if self.backend in PT_BACKENDS: self.model_hf = transformers.AutoModelForCausalLM.from_pretrained(self.model_id) - self.model = self.model_hf - self.model.config.torchscript = True # Set to export by convert_model via torch.jit.trace + self.model = CausalLMTracingWrapper(self.model_hf) + self.dummy_tensor = self.model_hf.dummy_inputs["input_ids"] elif self.backend in OV_BACKENDS + [BackendType.FP32]: