From 6c42b53a408a66234c8dc6ade4d60c5190290c2d Mon Sep 17 00:00:00 2001
From: "Wang, Chang"
Date: Tue, 27 Feb 2024 14:02:00 +0800
Subject: [PATCH] WOQ support autoround algo on cpu device (#1312)

* woq support autoround algo on cpu device

Signed-off-by: changwangss

---------

Signed-off-by: changwangss
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../quantization/requirements.txt  |  3 ++
 .../quantization/run_generation.py | 34 +++++++++++++-
 .../quantization/requirements.txt  |  1 +
 .../quantization/run_generation.py | 34 +++++++++++++-
 .../llm/quantization/utils.py      | 26 ++++++++---
 tests/CI/test_quantization.py      | 44 +++++++++++++++----
 tests/requirements.txt             |  1 +
 7 files changed, 127 insertions(+), 16 deletions(-)

diff --git a/examples/huggingface/pytorch/code-generation/quantization/requirements.txt b/examples/huggingface/pytorch/code-generation/quantization/requirements.txt
index 4c1e8f73720..ee256703140 100644
--- a/examples/huggingface/pytorch/code-generation/quantization/requirements.txt
+++ b/examples/huggingface/pytorch/code-generation/quantization/requirements.txt
@@ -12,4 +12,7 @@ neural-compressor
 intel_extension_for_pytorch==2.2.0
 optimum-intel
 git+https://github.com/bigcode-project/bigcode-evaluation-harness@00967d12093ef614de7bdad0772aed8e4118f1fd
+git+https://github.com/intel/auto-round.git@a868c805de4be271cfe7403309a64d9bf03a0ecf
+
+
diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py b/examples/huggingface/pytorch/code-generation/quantization/run_generation.py
index 5cb2e2a483d..715d99fb421 100644
--- a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py
+++ b/examples/huggingface/pytorch/code-generation/quantization/run_generation.py
@@ -68,7 +68,7 @@
 parser.add_argument(
     "--woq_algo",
     default="RTN",
-    choices=["RTN", "AWQ", "TEQ", "GPTQ"],
+    choices=["RTN", "AWQ", "TEQ", "GPTQ", "AUTOROUND"],
     help="Weight-only parameter.",
 )
 parser.add_argument(
@@ -133,6 +133,18 @@
     help="Calibration dataset sequence max length, this should align with your model config",
 )
 parser.add_argument('--gptq_static_groups', action='store_true', help='Use determined group to do quantization')
+# ============AUTOROUND configs==============
+parser.add_argument(
+    "--autoround_nsamples",
+    type=int, default=128,
+    help="Number of calibration data samples.",
+)
+parser.add_argument(
+    "--autoround_seq_len",
+    type=int,
+    default=2048,
+    help="Calibration dataset sequence max length, this should align with your model config",
+)
 # ============Harness configs============
 parser.add_argument("--tasks", default=None, help="Evaluation tasks")
 parser.add_argument(
@@ -281,6 +293,26 @@
             algorithm_args=algorithm_args,
             calib_dataset=args.dataset
         )
+    elif args.woq_algo == "AUTOROUND":
+        algorithm_args = {
+            "n_samples": args.autoround_nsamples,
+            "amp": False,
+            "seq_len": args.autoround_seq_len,
+            "iters": args.calib_iters,
+            "scale_dtype": "fp32",
+            "device": "cpu",
+        }
+        quantization_config = WeightOnlyQuantConfig(
+            compute_dtype=args.woq_compute_dtype,
+            scale_dtype=args.woq_scale_dtype,
+            weight_dtype=args.woq_weight_dtype,
+            scheme=args.woq_scheme,
+            group_size=args.woq_group_size,
+            algorithm=args.woq_algo,
+            tokenizer=tokenizer,
+            algorithm_args=algorithm_args,
+            calib_dataset=args.dataset
+        )
     else:
         quantization_config = WeightOnlyQuantConfig(
             weight_dtype=args.woq_weight_dtype,
diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements.txt b/examples/huggingface/pytorch/text-generation/quantization/requirements.txt
index 04aae9eca82..0f41926696c 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/requirements.txt
+++ b/examples/huggingface/pytorch/text-generation/quantization/requirements.txt
@@ -14,3 +14,4 @@ tiktoken #qwen
 einops #qwen
 git+https://github.com/intel/neural-compressor.git
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+git+https://github.com/intel/auto-round.git@a868c805de4be271cfe7403309a64d9bf03a0ecf
diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py
index 3329497b039..8333d113a8a 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py
@@ -95,7 +95,7 @@
 parser.add_argument(
     "--woq_algo",
     default="RTN",
-    choices=["RTN", "AWQ", "TEQ", "GPTQ"],
+    choices=["RTN", "AWQ", "TEQ", "GPTQ", "AUTOROUND"],
     help="Weight-only parameter.",
 )
 parser.add_argument(
@@ -159,6 +159,18 @@
     help="Calibration dataset sequence max length, this should align with your model config",
 )
 parser.add_argument('--gptq_static_groups', action='store_true', help='Use determined group to do quantization')
+# ============AUTOROUND configs==============
+parser.add_argument(
+    "--autoround_nsamples",
+    type=int, default=128,
+    help="Number of calibration data samples.",
+)
+parser.add_argument(
+    "--autoround_seq_len",
+    type=int,
+    default=2048,
+    help="Calibration dataset sequence max length, this should align with your model config",
+)
 # ============BitsAndBytes configs==============
 parser.add_argument("--bitsandbytes", action="store_true")
 # ============AutoModel parameters==============
@@ -292,6 +304,26 @@
             tokenizer=tokenizer,
             algorithm_args=algorithm_args,
         )
+    elif args.woq_algo == "AUTOROUND":
+        algorithm_args = {
+            "n_samples": args.autoround_nsamples,
+            "amp": False,
+            "seq_len": args.autoround_seq_len,
+            "iters": args.calib_iters,
+            "scale_dtype": "fp32",
+            "device": "cpu",
+        }
+        quantization_config = WeightOnlyQuantConfig(
+            compute_dtype=args.woq_compute_dtype,
+            scale_dtype=args.woq_scale_dtype,
+            weight_dtype=args.woq_weight_dtype,
+            scheme=args.woq_scheme,
+            group_size=args.woq_group_size,
+            algorithm=args.woq_algo,
+            tokenizer=tokenizer,
+            algorithm_args=algorithm_args,
+            calib_dataset=args.dataset
+        )
     else:
         quantization_config = WeightOnlyQuantConfig(
             compute_dtype=args.woq_compute_dtype,
diff --git a/intel_extension_for_transformers/llm/quantization/utils.py b/intel_extension_for_transformers/llm/quantization/utils.py
index 4a4d9f5fd0f..f839a9a1f49 100644
--- a/intel_extension_for_transformers/llm/quantization/utils.py
+++ b/intel_extension_for_transformers/llm/quantization/utils.py
@@ -173,7 +173,7 @@ def _replace_linear(
                 model._modules[name].requires_grad_(False)
                 if device == "cpu" or device == torch.device("cpu") or device == "auto":
                     if not empty_weights:
-                        if quantization_config.algorithm == "GPTQ":
+                        if quantization_config.algorithm == "GPTQ" or quantization_config.algorithm == "AUTOROUND":
                             from .gptq_utils import unpack_weight
                             int_weight, gptq_scales, gptq_zeros = unpack_weight(
                                 module.qweight,
@@ -237,7 +237,7 @@ def convert_to_quantized_model(model, config, device="cpu"):
     calib_func = config.calib_func
     calib_iters = config.calib_iters
     model_device = next(model.parameters()).device
-    if calib_dataloader is None and config.algorithm in ["TEQ", "AWQ", "GPTQ"]:
+    if calib_dataloader is None and config.algorithm in ["TEQ", "AWQ", "GPTQ", "AUTOROUND"]:
         from datasets import load_dataset
         from torch.utils.data import DataLoader
@@ -320,7 +320,8 @@ def default_calib_func(model):
         },
         "awq_args": config.algorithm_args.update({"enable_mse_search": config.mse_range})
         if config.algorithm == "AWQ" and config.algorithm_args is not None else {},
-        "gptq_args": config.algorithm_args if config.algorithm == "GPTQ" else None
+        "gptq_args": config.algorithm_args if config.algorithm == "GPTQ" else None,
+        "autoround_args": config.algorithm_args if config.algorithm == "AUTOROUND" else None
     }
     conf = PostTrainingQuantConfig(
         approach="weight_only",
@@ -346,7 +347,7 @@ def default_calib_func(model):
     )
     # TEQ: set calib_func=None, use default training func as calib_func
     # RTN: doesn't need calib_func
-    if config.algorithm in ["TEQ", "RTN", "GPTQ"]:
+    if config.algorithm in ["TEQ", "RTN", "GPTQ", "AUTOROUND"]:
         calib_func = None

     orig_dtype = torch.float32
@@ -360,6 +361,7 @@ def default_calib_func(model):
         conf,
         calib_func=calib_func,
         calib_dataloader=calib_dataloader)
+
     if device == "xpu" or device == torch.device("xpu"):
         model = inc_model.export_compressed_model(compression_dtype=torch.int8,
                                                   compression_dim=0,
@@ -374,7 +376,6 @@ def default_calib_func(model):
         if config.algorithm == "GPTQ":
             inc_model = inc_model.export_compressed_model(use_optimum_format=True)
             inc_model.eval()
-
             quantize_config = {
                 "bits": bits,
                 "group_size": config.group_size,
                 "desc_act": False,
                 "sym": True if config.scheme == "sym" else False,
                 "true_sequential": True,
                 "model_name_or_path": "null",
                 "model_file_base_name": "model",
             }
@@ -386,6 +387,21 @@
+            setattr(config, "gptq_quantize_config", quantize_config)
+            q_model = replace_linear(inc_model, None, None, config, device=device)
+        elif config.algorithm == "AUTOROUND":
+            inc_model = inc_model.export_compressed_model(use_optimum_format=True)
+            inc_model.eval()
+            quantize_config = {
+                "bits": bits,
+                "group_size": config.group_size,
+                "desc_act": False,
+                "sym": True if config.scheme == "sym" else False,
+                "true_sequential": True,
+                "model_name_or_path": "null",
+                "model_file_base_name": "model",
+            }
+            setattr(config, "gptq_quantize_config", quantize_config)
             q_model = replace_linear(inc_model, None, None, config, device=device)
         else:
diff --git a/tests/CI/test_quantization.py b/tests/CI/test_quantization.py
index 7aef9f17351..53f1a4d25ce 100644
--- a/tests/CI/test_quantization.py
+++ b/tests/CI/test_quantization.py
@@ -321,7 +321,7 @@ def test_quantization_for_llm(self):
         from intel_extension_for_transformers.transformers import AutoModelForCausalLM
         fp32_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, use_neural_speed=False)
         dummy_input = fp32_model.dummy_inputs["input_ids"]
-        #smooth-quant
+        # SQ
         sq_config = SmoothQuantConfig(
                                     tokenizer=tokenizer,  # either two of one, tokenizer or calib_func
                                     calib_iters=2,
@@ -332,7 +332,8 @@ def test_quantization_for_llm(self):
                                                     use_neural_speed=False
                                                     )
         self.assertTrue(isinstance(q_model.model, torch.jit.ScriptModule))
-        #SQ auto
+
+        # SQ auto
         recipes = {
             "smooth_quant": True,
             "smooth_quant_args": { "alpha": "auto", "auto_alpha_args":{"alpha_max": 0.6,
@@ -349,8 +350,9 @@ def test_quantization_for_llm(self):
                                                     use_neural_speed=False
                                                     )
         self.assertTrue(isinstance(q_model.model, torch.jit.ScriptModule))
+
         # weight-only
-        #RTN
+        # RTN
         woq_config = WeightOnlyQuantConfig(weight_dtype="int4_fullrange")
         woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                          quantization_config=woq_config,
@@ -358,9 +360,9 @@ def test_quantization_for_llm(self):
                                                          )
         woq_model.eval()
         output = woq_model(dummy_input)
-        print("output:", float(output[0][0][0][0]))
         self.assertTrue(isclose(float(output[0][0][0][0]), 0.16387596726417542, rel_tol=1e-04))
-        #AWQ
+
+        # AWQ
         woq_config = WeightOnlyQuantConfig(weight_dtype="int4_fullrange",
                                            calib_iters=5,
                                            tokenizer=tokenizer,
@@ -373,7 +375,8 @@ def test_quantization_for_llm(self):
         output = woq_model(dummy_input)
         print("output:", float(output[0][0][0][0]))
         self.assertTrue(isclose(float(output[0][0][0][0]), 0.17239853739738464, rel_tol=1e-04))
-        #TEQ
+
+        # TEQ
         woq_config = WeightOnlyQuantConfig(weight_dtype="int4_fullrange",
                                            calib_iters=5,
                                            tokenizer=tokenizer,
@@ -384,7 +387,8 @@ def test_quantization_for_llm(self):
                                                          )
         woq_model.eval()
         output = woq_model(dummy_input)
-        #fp8
+
+        # fp8
         woq_config = WeightOnlyQuantConfig(weight_dtype="fp8_e5m2", scale_dtype="fp8_e8m0")
         woq_model = AutoModelForCausalLM.from_pretrained(
             model_name_or_path, quantization_config=woq_config, use_neural_speed=False
@@ -394,6 +398,7 @@ def test_quantization_for_llm(self):
         self.assertTrue(
             isclose(float(output[0][0][0][0]), 0.16162332892417908, rel_tol=1e-04)
         )
+
         # amp
         amp_config = MixedPrecisionConfig()
         amp_model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
@@ -403,6 +408,7 @@ def test_quantization_for_llm(self):
         amp_model.eval()
         output = amp_model(dummy_input)
         self.assertTrue(isclose(float(output[0][0][0][0]), 0.1689453125, rel_tol=1e-04))
+
         # bitsandbytes, for cpu is fp32 model
         bab_config = BitsAndBytesConfig()
         bab_model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
@@ -430,7 +436,7 @@ def test_quantization_for_llm(self):
         print("output:", float(output[0][0][0][0]))
         self.assertTrue(isclose(float(output[0][0][0][0]), 0.1675747185945511, rel_tol=1e-04))

-        #GPTQ
+        # GPTQ
         algorithm_args = {
             "act_order": False,
             "percdamp": 0.01,
@@ -449,9 +455,29 @@ def test_quantization_for_llm(self):
                                                          )
         woq_model.eval()
         output = woq_model(dummy_input)
-        print("output:", float(output[0][0][0][0]))
         self.assertTrue(isclose(float(output[0][0][0][0]), 0.17126554250717163, rel_tol=1e-04))
+        # AUTOROUND
+        algorithm_args = {
+            "n_samples": 128,
+            "amp": False,
+            "seq_len": 32,
+            "iters": 5,
+            "scale_dtype": "fp32",
+            "device": "cpu",
+        }
+        woq_config = WeightOnlyQuantConfig(weight_dtype="int4_clip",
+                                           algorithm_args=algorithm_args,
+                                           tokenizer=tokenizer,
+                                           algorithm="AUTOROUND")
+        woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
+                                                         quantization_config=woq_config,
+                                                         use_neural_speed=False
+                                                         )
+        woq_model.eval()
+        output = woq_model(dummy_input)
+        self.assertTrue(isclose(float(output[0][0][0][0]), 0.18015708029270172, rel_tol=1e-04))
+
     def test_export(self):
         # test model with model_id
         self.trainer.export_to_onnx("export.onnx")
diff --git a/tests/requirements.txt b/tests/requirements.txt
index 898ecd8e79c..6ab01ac719e 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -5,6 +5,7 @@ datasets==2.14.7
 einops
 evaluate
 gguf
+git+https://github.com/intel/auto-round.git@a868c805de4be271cfe7403309a64d9bf03a0ecf
 git+https://github.com/intel/neural-compressor.git
 intel-extension-for-pytorch==2.2.0
 intel-tensorflow==2.14.0
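
Usage sketch for the example scripts (illustrative, not part of the diff): the new algorithm is selected through the flags added above. Only --woq_algo AUTOROUND, --autoround_nsamples and --autoround_seq_len are introduced by this patch; --woq_weight_dtype is referenced in the diff, while --model and --woq are assumed from run_generation.py's existing interface and the model id is a placeholder.

    python examples/huggingface/pytorch/text-generation/quantization/run_generation.py \
        --model facebook/opt-125m \
        --woq \
        --woq_algo AUTOROUND \
        --woq_weight_dtype int4_clip \
        --autoround_nsamples 128 \
        --autoround_seq_len 2048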
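
For the transformers-style API, a minimal sketch of the AUTOROUND path follows; it mirrors the settings used in tests/CI/test_quantization.py above, the model id is a placeholder rather than the one used in CI, and the import locations are assumed to match the imports already present in that test file.

    from transformers import AutoTokenizer
    from intel_extension_for_transformers.transformers import (
        AutoModelForCausalLM,
        WeightOnlyQuantConfig,
    )

    model_name_or_path = "facebook/opt-125m"  # placeholder model id, not taken from this patch
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    # algorithm_args mirrors the AUTOROUND settings exercised in test_quantization_for_llm
    algorithm_args = {
        "n_samples": 128,
        "amp": False,
        "seq_len": 32,
        "iters": 5,
        "scale_dtype": "fp32",
        "device": "cpu",
    }
    woq_config = WeightOnlyQuantConfig(
        weight_dtype="int4_clip",
        algorithm="AUTOROUND",
        algorithm_args=algorithm_args,
        tokenizer=tokenizer,
    )
    # from_pretrained runs weight-only quantization with the AUTOROUND recipe on CPU
    woq_model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        quantization_config=woq_config,
        use_neural_speed=False,
    )
    woq_model.eval()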