diff --git a/src/nncf/quantization/algorithms/weight_compression/activation_stats.py b/src/nncf/quantization/algorithms/weight_compression/activation_stats.py index 0e5860942fd..f2a084688b0 100644 --- a/src/nncf/quantization/algorithms/weight_compression/activation_stats.py +++ b/src/nncf/quantization/algorithms/weight_compression/activation_stats.py @@ -17,16 +17,23 @@ from nncf.tensor import functions as fns -def process_stats(stats: WCTensorStatistic, subset_size: int, act_ch_axis: int = -1) -> tuple[Tensor, Tensor]: +def process_stats( + stats: WCTensorStatistic, + subset_size: int, + act_ch_axis: int = -1, + transpose_a: bool = False, +) -> tuple[Tensor, Tensor]: """ A function for processing activations. Shared between AWQ, Scale Estimation and LoRA Correction algorithms. :param stats: An object containing statistics for the layer. :param subset_size: The number of samples for AWQ. If subset_size <= 0, all samples are used. :param act_ch_axis: The activation channel axis. + :param transpose_a: When True, returns X in [SampleSize, HiddenDim] layout instead of the default + [HiddenDim, SampleSize]. Used by LoRA Correction which requires samples as rows. 
:return: tuple of the following tensors: - s - maximum channel magnitude across samples [HiddenDim] - X - average channel magnitude across tokens in the sequence [HiddenDim, min(SampleSize, ~subset_size)] + s - maximum channel magnitude across samples, shape [HiddenDim] + X - activations, [HiddenDim, N] or [N, HiddenDim] if transpose_a=True, N = min(SampleSize, ~subset_size) """ X = fns.stack( stats.mean_values @@ -37,8 +44,13 @@ def process_stats(stats: WCTensorStatistic, subset_size: int, act_ch_axis: int = axes = list(range(1, len(X.shape))) + [0] X_full = fns.transpose(X, axes=axes) - # The sample dimension is always the last axis after transpose - sample_axis = -1 + if transpose_a: + axes = list(range(len(X_full.shape))) + axes[-1], axes[-2] = axes[-2], axes[-1] + X_full = fns.transpose(X_full, axes=axes) + + # The sample dimension is axis -1 by default, but moves to -2 if transpose_a is True + sample_axis = -2 if transpose_a else -1 # Prevent high memory and time consumption by sampling if X_full.shape[sample_axis] > subset_size and subset_size > 0: @@ -47,11 +59,13 @@ def process_stats(stats: WCTensorStatistic, subset_size: int, act_ch_axis: int = ] step = X_full.shape[sample_axis] // subset_size idxs = [i[0] for i in sorted(enumerate(lens), key=lambda x: -x[1])][::step] - X = X_full[..., idxs] + if transpose_a: + X = X_full[..., idxs, :] + else: + X = X_full[..., idxs] else: X = X_full - # Compute max magnitude along the sample axis (last axis) - # Result: [HiddenDim] or [No. 
of Experts, HiddenDim] + # Compute max magnitude along the sample axis s = fns.max(fns.abs(X_full), axis=sample_axis) return s, X diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index 97cf37be150..8985a182c67 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -1181,11 +1181,6 @@ def apply_with_parameters( ) if self._lora_correction: - for wc_params in all_weight_params: - if self._backend_entity.matmul_has_transposed_activations(wc_params.node_with_weight, graph): - msg = "Transposed activations are not supported yet for the LoRa correction algorithm" - raise nncf.UnsupportedModelError(msg) - lora_correction_params = self._advanced_parameters.lora_correction_params lora_correction_algo = LoraCorrectionAlgorithm(statistics, lora_correction_params) description += " with correction of low-rank adapters" @@ -1399,7 +1394,7 @@ def _get_statistics_for_weights_compression( # Where mean_value is a 1D tensor representing an activation reduced over batch and sequence length dimensions, # shape is an original shape of an activation before reduction, n is the size of the dataset (or subset_size). 
statistics = {} - for (act_node, output_port_id, _), matmul_nodes in matmul_input_to_output_nodes_map.items(): + for (act_node, output_port_id, _act_channel_axis), matmul_nodes in matmul_input_to_output_nodes_map.items(): tensor_collectors = list( statistic_points.get_algo_statistics_for_node( act_node.node_name, diff --git a/src/nncf/quantization/algorithms/weight_compression/lora_correction.py b/src/nncf/quantization/algorithms/weight_compression/lora_correction.py index 1c8bcbb63b1..785c89076de 100644 --- a/src/nncf/quantization/algorithms/weight_compression/lora_correction.py +++ b/src/nncf/quantization/algorithms/weight_compression/lora_correction.py @@ -106,7 +106,11 @@ def is_applicable(self, wc_params: WeightCompressionParameters): return wc_params.compression_config.num_bits == 4 def calculate_adapters( - self, weight: Tensor, compressed_weight: CompressedWeight, wc_params: WeightCompressionParameters + self, + weight: Tensor, + compressed_weight: CompressedWeight, + wc_params: WeightCompressionParameters, + act_ch_axis: int, ) -> tuple[Tensor, Tensor, list[float]]: """ Calculates low rank matrices for a given original and compressed weights. @@ -114,6 +118,7 @@ def calculate_adapters( :param weight: original floating-point weight matrix. :param compressed_weight: compressed weight matrix. :param wc_params: parameters of weight compression. + :param act_ch_axis: axis of the activation tensor that corresponds to its channels. :return: two low rank matrices in the order of execution of corresponding linear layers. 
""" layer_name = wc_params.node_with_weight.node_name @@ -126,6 +131,7 @@ def calculate_adapters( wc_params.reduction_axes, self._lora_correction_params, layer_statistics, + act_ch_axis, is_debug, ) if is_debug: @@ -140,6 +146,7 @@ def calculate_low_rank_matrices( reduction_axes: tuple[int, ...], lora_correction_params: AdvancedLoraCorrectionParameters, layer_statistics: WCTensorStatistic, + act_ch_axis: int, is_debug: bool | None = False, ): """ @@ -155,6 +162,7 @@ def calculate_low_rank_matrices( :param reduction_axes: axes along which different statistics reduced. :param lora_correction_params: parameters to configure the algorithm. :param layer_statistics: an object containing statistics for the layer. + :param act_ch_axis: axis of the activation tensor that corresponds to its channels. :param is_debug: whether to collect debug information, defaults to False. :return: two low rank matrices in the order of execution of corresponding linear layers and list of mean noises. Noises are collected from each step of the algorithm if debug was enabled. 
@@ -168,7 +176,12 @@ def calculate_low_rank_matrices( ) mode = compression_config.mode assert len(reduction_axes) == 1, "Assumed a single reduction axis" - reduction_axis = reduction_axes[0] if compression_config.group_size != -1 else -1 + + if compression_config.group_size != -1: + reduction_axis = reduction_axes[0] + else: + reduction_axis = -1 + if mode in (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM): fq_weights = do_integer_dequantization( compressed_weight, @@ -190,8 +203,8 @@ def calculate_low_rank_matrices( svd_residual = fns.transpose(svd_residual) residual = svd_residual.clone() # [H, O] - s, X = process_stats(layer_statistics, subset_size) # [H], [H, SS] - X = fns.transpose(X) # [SS, H] + # Call process_stats with transpose_a=True so that X is returned in [SS, H] layout; s is [H] + s, X = process_stats(layer_statistics, subset_size, act_ch_axis, transpose_a=True) if compression_config.group_size > 0: # Multiply residual of weights by maximum channel magnitude of activations normalized per quantization # group. As a consequence, weights corresponding to a "noisy" activations has a higher error to correct. 
diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index cf86e2e00dd..98d22529788 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -204,7 +204,8 @@ def insert_adapters( A_W = opset.constant(lora_A.data) B_W = opset.constant(lora_B.data) - A_MM = opset.matmul(input_node, A_W, transpose_a=False, transpose_b=True) + transpose_a = wc_params.node_with_weight.layer_attributes.input_attributes["transpose"] + A_MM = opset.matmul(input_node, A_W, transpose_a=transpose_a, transpose_b=True) B_MM = opset.matmul(A_MM, B_W, transpose_a=False, transpose_b=True) node_output_port = mm_node.output(0) @@ -361,7 +362,15 @@ def transform_model( compressed_weight.tensor = compressed_weight.tensor.as_numpy_tensor() if compressed_weight.zero_point is not None: compressed_weight.zero_point = compressed_weight.zero_point.as_numpy_tensor() - adapters = lora_correction_algo.calculate_adapters(weight, compressed_weight, wc_params) + + activation_port_id = self.get_activation_port_id(wc_params.node_with_weight, graph) + activation_edge = graph.get_input_edge_by_port_id(wc_params.node_with_weight, activation_port_id) + activation_shape = activation_edge.tensor_shape + act_ch_axis = self.get_activation_channel_axis( + wc_params.node_with_weight, activation_port_id, activation_shape + ) + + adapters = lora_correction_algo.calculate_adapters(weight, compressed_weight, wc_params, act_ch_axis) self.insert_adapters(wc_params, *adapters, int8_lora=lora_correction_algo.use_int8_adapters) self.name_to_node_mapping = None diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 7f5322b2ac5..d32dc0dcd01 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ 
b/tests/openvino/native/quantization/test_weights_compression.py @@ -29,6 +29,7 @@ from nncf import SensitivityMetric from nncf.common.factory import build_graph from nncf.common.tensor_statistics.collectors import AggregatorBase +from nncf.common.tensor_statistics.statistics import WCTensorStatistic from nncf.common.utils.debug import nncf_debug from nncf.common.utils.helpers import set_env_variable from nncf.data.dataset import Dataset @@ -45,6 +46,7 @@ from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams from nncf.quantization.advanced_parameters import GroupSizeFallbackMode +from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -1703,13 +1705,97 @@ def test_call_max_var_criterion_with_dataset_gptq_neg_group_size(mode): assert op.get_shape() == [sz, 1] +@pytest.mark.parametrize("act_ch_axis", [0, -1]) +def test_process_stats_with_transpose_a_changes_layout(act_ch_axis): + activations = [np.random.randn(8).astype(np.float32) for _ in range(10)] + stats = WCTensorStatistic( + [Tensor(act) for act in activations], + [(1, 8) if act_ch_axis == 0 else (10, 8)], # Dummy shapes for process_stats sampling logic + ) + + subset_size = 10 + + # Case 1: transpose_a=False + s_default, X_default = process_stats( + stats, + subset_size=subset_size, + act_ch_axis=act_ch_axis, + transpose_a=False, + ) + + # Case 2: transpose_a=True + s_transposed, X_transposed = process_stats( + stats, + subset_size=subset_size, + act_ch_axis=act_ch_axis, + transpose_a=True, + ) + + # Hidden dimensions must be preserved in max magnitude + assert 
s_default.shape == s_transposed.shape + + # Layout must be effectively transposed for X + # process_stats returns [HiddenDim, SampleSize] by default + # If transpose_a=True, it returns [SampleSize, HiddenDim] + assert X_default.shape[0] == X_transposed.shape[1] + assert X_default.shape[1] == X_transposed.shape[0] + + +@pytest.mark.parametrize("transpose_a", [False, True]) +@pytest.mark.parametrize("transpose_b", [False, True]) +def test_lora_transpose_a_fix(transpose_a, transpose_b): + """ + Test LoRA compression works correctly with transpose_a and transpose_b configurations. + """ + # Setup LoRA parameters + params = LoraParams(adapter_rank=4, use_int8_adapters=False) + advanced_parameters = CompressionParams(lora_correction_params=params) + + # Initialize model with given transpose configuration + input_shape = [1, 16, 16] # [batch, seq, hidden] + model = LMLinearModel(transpose_b=transpose_b, transpose_a=transpose_a, input_shape=input_shape) + ov_model = model.ov_model + dataset = Dataset(np.ones(input_shape, dtype=np.float32) for _ in ov_model.inputs) + + # Compress weights with LoRA correction enabled + compressed_model = compress_weights( + ov_model, + mode=CompressWeightsMode.INT4_SYM, + ratio=1.0, + group_size=8, + dataset=dataset, + all_layers=True, + lora_correction=True, + advanced_parameters=advanced_parameters, + ) + + # Simple assertion: compressed model is returned and has expected nodes + assert compressed_model is not None + # Verify that LoRA adapters were added (3 MatMuls instead of 1) + matmuls = [node for node in compressed_model.get_ops() if node.get_type_name() == "MatMul"] + assert len(matmuls) == 3 + + @pytest.mark.parametrize( - "params, transpose_b", - ((None, True), (LoraParams(adapter_rank=4, use_int8_adapters=False), False)), + "params, transpose_a, transpose_b", + ( + (None, False, True), # original + (LoraParams(adapter_rank=4, use_int8_adapters=False), False, False), # original + pytest.param( + LoraParams(adapter_rank=4, 
use_int8_adapters=False), + True, + False, + ), + pytest.param( + LoraParams(adapter_rank=8, use_int8_adapters=True), + True, + True, + ), + ), ) -def test_lora_adapters_in_the_graph(params, transpose_b): +def test_lora_adapters_in_the_graph(params, transpose_a, transpose_b): advanced_parameters = CompressionParams() if params is None else CompressionParams(lora_correction_params=params) - model = LMLinearModel(transpose_b=transpose_b) + model = LMLinearModel(transpose_a=transpose_a, transpose_b=transpose_b) ov_model = model.ov_model dataset = Dataset(np.ones(inp.shape) for inp in ov_model.inputs) @@ -1887,7 +1973,7 @@ def test_compression_with_lora_with_subset_size(mocker): get_stats_spy.assert_called_once() s, X = get_stats_spy.spy_return - assert X.shape == (model.hidden_dim, subset_size) + assert X.shape == (subset_size, model.hidden_dim) assert s.shape == (model.hidden_dim,) @@ -2596,6 +2682,61 @@ def get_moe_scale_estimation_ref(check_sampling_activation_stats_flow): ), )[check_sampling_activation_stats_flow] + @pytest.mark.parametrize("is_moe", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))]) + @pytest.mark.parametrize("check_sampling_activation_stats_flow", [False, True]) + def test_scale_estimation(self, mocker, is_moe, check_sampling_activation_stats_flow): + return super().test_scale_estimation(mocker, is_moe, check_sampling_activation_stats_flow) + + @pytest.mark.parametrize( + "is_3d_weights", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))] + ) + def test_awq_with_ignored_scope(self, mocker, is_3d_weights): + return super().test_awq_with_ignored_scope(mocker, is_3d_weights) + + # Transpose inputs does not affect mergable pattern code + @pytest.mark.parametrize("transpose_a,non_mergable_pattern", [(True, True), (False, True), (False, False)]) + @pytest.mark.parametrize( + "is_3d_weights", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))] + ) + def 
test_awq_scale_reference( + self, + non_mergable_pattern, + transpose_a, + test_awq_scale_ref, + transpose_a_supported, + is_3d_weights, + monkeypatch, + mocker, + ): + return super().test_awq_scale_reference( + non_mergable_pattern, + transpose_a, + test_awq_scale_ref, + transpose_a_supported, + is_3d_weights, + monkeypatch, + mocker, + ) + + @pytest.mark.parametrize( + "is_3d_weights", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))] + ) + @pytest.mark.parametrize("dataset", [None, np.ones([2, 8, 8], dtype=np.float32)]) + @pytest.mark.parametrize("prefer_data_aware_scaling", [True, False]) + def test_data_free_awq(self, dataset, prefer_data_aware_scaling, is_3d_weights, mocker): + return super().test_data_free_awq(dataset, prefer_data_aware_scaling, is_3d_weights, mocker) + + @pytest.mark.parametrize( + "is_3d_weights", [False, pytest.param(True, marks=pytest.mark.xfail(reason="Ticket - 176465"))] + ) + @pytest.mark.parametrize("with_multiply", (True, False)) + def test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul( + self, int4_mode, with_multiply, is_3d_weights, mocker + ): + return super().test_call_max_var_criterion_with_dataset_by_default_awq_act_matmul( + int4_mode, with_multiply, is_3d_weights, mocker + ) + @staticmethod def get_orig_weight(model: ov.Model) -> Tensor: for op in model.get_ordered_ops(): @@ -2759,6 +2900,41 @@ def test_awq_scale_ref() -> list[dict[str, Tensor]]: def transpose_a_supported(self) -> bool: return True + @pytest.mark.parametrize( + "kwargs", + [ + dict(scale_estimation=True), + dict( + gptq=True, + advanced_parameters=CompressionParams(gptq_params=GPTQParams(subset_size=2)), + ), + ], + ) + def test_compression_skipped_with_transposed_activations(self, transpose_a_supported, kwargs): + if not transpose_a_supported: + pytest.skip("transpose_a is not supported for the current backend") + if kwargs.get("scale_estimation", False) and "scale_estimation" in 
self.get_not_supported_algorithms(): + pytest.skip("Scale estimation is not supported") + if kwargs.get("gptq", False) and "gptq" in self.get_not_supported_algorithms(): + pytest.skip("GPTQ is not supported") + + INPUT_SHAPE = (2, 4) + model = self.get_transposable_awq_model(transpose_a=True, transpose_b=True, input_shape=INPUT_SHAPE) + input = 0.01 * np.arange(0, np.multiply.reduce(INPUT_SHAPE), dtype=np.float32).reshape(INPUT_SHAPE) + 0.02 + input = self.to_tensor(input) + dataset = Dataset([input] * 2, self.get_transform_func()) + + with pytest.raises(nncf.UnsupportedModelError): + compress_weights( + model, + mode=CompressWeightsMode.INT4_SYM, + ratio=1.0, + group_size=1, + subset_size=2, + dataset=dataset, + all_layers=True, + **kwargs, + ) def test_phi_rope_model(self): model = Phi3dot5RoPEModel().ov_model compressed_model = compress_weights(