Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,127 changes: 179 additions & 1,948 deletions auto_round/compressors/base.py

Large diffs are not rendered by default.

36 changes: 36 additions & 0 deletions auto_round/quantizers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

from typing import TYPE_CHECKING

from auto_round.quantizers.base import QuantizerType, BaseQuantizer
from auto_round.quantizers.mode import TuningQuantizer, RTNQuantizer
from auto_round.quantizers.model_type import LLMQuantizer
from auto_round.quantizers.data_type import GGUFQuantizer

if TYPE_CHECKING:
from auto_round.compressors import BaseCompressor


def create_quantizer(cls: "BaseCompressor"):
# example
quantizers = {
# QuantizerType.DATA_TYPE: GGUFQuantizer,
# QuantizerType.MODEL_TYPE: LLMQuantizer,
QuantizerType.MODE: RTNQuantizer if cls.iters == 0 else TuningQuantizer,
}

dynamic_quantizer = type("AutoRoundQuantizer", tuple(quantizers.values()), {})
return dynamic_quantizer(cls)
145 changes: 145 additions & 0 deletions auto_round/quantizers/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import traceback
import types
from enum import IntEnum
from typing import TYPE_CHECKING, Any, Callable, Union

import torch

from auto_round.data_type import QUANT_FUNC_WITH_DTYPE
from auto_round.logger import logger
from auto_round.schemes import QuantizationScheme
from auto_round.utils import check_to_quantized, clear_memory

if TYPE_CHECKING:
from auto_round.compressors import BaseCompressor


class QuantizerType(IntEnum):
MODE = 1
MODEL_TYPE = 2
DATA_TYPE = 3


class BaseQuantizer:
_quantizer_classes: dict[QuantizerType, dict[str, type[BaseQuantizer]]] = {
QuantizerType.MODE: {},
QuantizerType.MODEL_TYPE: {},
QuantizerType.DATA_TYPE: {},
}
compressor: "BaseCompressor" = None

def __init__(self, compressor: "BaseCompressor"):
self.compressor = compressor

@staticmethod
def quanize(self):
"""Quantize the model and return the quantized model along with layer configurations.The entry of Quantizer.
Returns:
The quantized model and layer configurations.
"""
pass

def __getattr__(self, name):
if hasattr(self.compressor, name) and not isinstance(getattr(self.compressor, name), types.MethodType):
return getattr(self.compressor, name)
raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

def __setattr__(self, name, value):
if hasattr(self.compressor, name):
setattr(self.compressor, name, value)
else:
super().__setattr__(name, value)

@torch.inference_mode()
def _quantize_embedding_layer(self):
"""Quantizes embedding layers in the model according to the configuration.

This method iterates through all modules in the model, identifies embedding
layers specified in `self.layer_config`, and applies the appropriate quantization
function based on bit precision, grouping strategy, and dtype.

Returns:
bool: True if the quantization process completes without critical errors.
"""
is_quantized = False
for name, module in self.model.named_modules():
# Skip non-Embedding modules or layers not in config
if not isinstance(module, torch.nn.Embedding) or name not in self.layer_config:
continue

config = self.layer_config[name]

# Skip layers that are not marked for quantization
if not check_to_quantized(config):
continue
is_quantized = True
config["scale_dtype"] = self.scale_dtype
dtype = config["data_type"]

# Determine quantization function key with symmetry/asymmetry
if dtype not in QUANT_FUNC_WITH_DTYPE:
dtype = f"{dtype}_{'sym' if config['sym'] else 'asym'}"

# Optionally use optimized rounding (RTN) variant
if not self.disable_opt_rtn and f"rtn_{dtype}" in QUANT_FUNC_WITH_DTYPE:
dtype = f"rtn_{dtype}"

quant_func = QUANT_FUNC_WITH_DTYPE[dtype]

# Attempt quantization on GPU, fall back to CPU if OOM
try:
weight, scale, zp = quant_func(
module.weight.to(self.device),
**{k: config[k] for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"]},
)
except RuntimeError as e:
cuda_error_msg = traceback.format_exc()
try:
logger.error(cuda_error_msg)
logger.warning("falling back to CPU")
weight, scale, zp = quant_func(
module.weight.to("cpu"),
**{
k: config[k]
for k in ["bits", "group_size", "super_bits", "super_group_size", "scale_dtype"]
},
)
except Exception as e:
raise

# Overwrite the module's weights with the quantized version
module.weight.data.copy_(weight.cpu())

# Attach scale and zero point (zp) to the module
for param_name, value in zip(["scale", "zp"], [scale, zp]):
if isinstance(value, dict):
for k, v in value.items():
setattr(module, k if k == "scale" else f"w_{k}", v.cpu())
elif isinstance(value, torch.Tensor):
setattr(module, param_name, value.cpu())
else:
setattr(module, param_name, value)

# Update config
self.layer_config.setdefault(name, {}).update(config)

# Release memory
clear_memory()

return is_quantized
19 changes: 19 additions & 0 deletions auto_round/quantizers/data_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from auto_round.quantizers.base import BaseQuantizer


class GGUFQuantizer(BaseQuantizer):
pass
16 changes: 16 additions & 0 deletions auto_round/quantizers/mode/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from auto_round.quantizers.mode.rtn import RTNQuantizer
from auto_round.quantizers.mode.tuning import TuningQuantizer
Loading
Loading