Commit 92c9423

Support true sequential options in gptq (#1656)
Signed-off-by: YIYANGCAI <yiyang.cai@intel.com>
1 parent d640297 commit 92c9423

6 files changed (+260 −102 lines)

docs/source/quantization_weight_only.md

+2
@@ -87,6 +87,8 @@ Notes:
 | use_max_length | False | Whether to align all calibration data to fixed length, which equals to pad_max_length. |
 | block_size | 128 | Execute GPTQ quantization per block, block shape = [$C_{out}$, block_size] |
 | static_groups | False | Whether to calculate group-wise quantization parameters in advance. This option mitigates act_order's extra computational requirements. |
+| true_sequential | False | Whether to quantize layers within a transformer block in their original order. This can lead to higher accuracy but a slower overall quantization process. |
+| lm_head | False | Whether to quantize the lm_head (the linear layer that produces predictions at the end of the language model). |

 **Note:** Neural Compressor provides `Unsigned integer for asymmetric quantization` and `Signed integer for symmetric quantization`. Please follow the below section to compress the low bit data type for saving.
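For context, both new knobs flow through the `gptq_args` recipe documented above. Below is a minimal sketch of setting them via Neural Compressor's 2.x weight-only API; `model` and `dataloader` are caller-supplied placeholders, and the `op_type_dict` values are illustrative defaults rather than part of this commit.

```python
# A minimal sketch, assuming the 2.x PostTrainingQuantConfig weight-only API.
# `model` and `dataloader` are placeholders supplied by the caller.
from neural_compressor import PostTrainingQuantConfig, quantization

conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={
        ".*": {  # match all supported ops; these values are illustrative
            "weight": {"bits": 4, "group_size": 128, "scheme": "asym", "algorithm": "GPTQ"},
        },
    },
    recipes={
        "gptq_args": {
            "act_order": False,
            "static_groups": False,
            "true_sequential": True,  # quantize sub-layers in their original order
            "lm_head": True,          # also quantize the final prediction layer
        },
    },
)
q_model = quantization.fit(model, conf, calib_dataloader=dataloader)
```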

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py

+4 −1
@@ -77,6 +77,8 @@
                         this should align with your model config, \
                         and your dataset builder args: args.pad_max_length')
 parser.add_argument('--gptq_static_groups', action='store_true', help='Use determined group to do quantization')
+parser.add_argument('--gptq_true_sequential', action='store_true', help="Whether to run in true_sequential mode.")
+parser.add_argument('--gptq_lm_head', action='store_true', help="Whether to use GPTQ to quantize the output layer of the LLMs.")
 # ==============code generation args===========
 parser.add_argument("--code_generation", action="store_true")
 parser.add_argument("--n_samples", default=200, type=int)
@@ -278,7 +280,8 @@ def calib_func(prepared_model):
             'use_max_length': args.gptq_use_max_length,
             'pad_max_length': args.gptq_pad_max_length,
             'static_groups': args.gptq_static_groups,
-            "enable_mse_search": args.woq_enable_mse_search,
+            "true_sequential": args.gptq_true_sequential,
+            "lm_head": args.gptq_lm_head,
         }
         # GPTQ: use assistive functions to modify calib_dataloader and calib_func
         # TEQ: set calib_func=None, use default training func as calib_func
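A hypothetical invocation of the example script with the new switches; apart from `--gptq_true_sequential` and `--gptq_lm_head`, the flags and model name are assumed from the script's existing interface.

```bash
# Hypothetical command line; only the last two flags are new in this commit.
python run_clm_no_trainer.py \
    --model facebook/opt-125m \
    --quantize \
    --woq_algo GPTQ \
    --gptq_use_max_length \
    --gptq_true_sequential \
    --gptq_lm_head
```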

neural_compressor/adaptor/pytorch.py

+2
@@ -4722,6 +4722,8 @@ def gptq_quantize(self, model, tune_cfg, dataloader):
                 "act_order": self.recipes["gptq_args"].get("act_order", False),
                 "block_size": self.recipes["gptq_args"].get("block_size", True),
                 "static_groups": self.recipes["gptq_args"].get("static_groups", False),
+                "true_sequential": self.recipes["gptq_args"].get("true_sequential", False),
+                "lm_head": self.recipes["gptq_args"].get("lm_head", False),
             }
             nsamples = self.recipes["gptq_args"].get("nsamples", 128)
             use_max_length = self.recipes["gptq_args"].get("use_max_length", False)
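Conceptually, `true_sequential` changes which activations each layer is calibrated against. The self-contained toy below illustrates only that ordering: plain round-to-nearest stands in for the activation-aware GPTQ solve, and the helper is illustrative, not the adapter's real internals.

```python
# Toy illustration of the true_sequential ordering (not the adapter's code).
import torch

def gptq_like_update(layer: torch.nn.Linear, calib_inputs: torch.Tensor, bits: int = 4) -> None:
    # Real GPTQ minimizes the layer-output error over calib_inputs via a
    # Hessian-based solve; plain rounding stands in so the data flow stays visible.
    w = layer.weight.data
    qmax = 2 ** (bits - 1) - 1
    scale = w.abs().max() / qmax
    layer.weight.data = (w / scale).round().clamp_(-qmax - 1, qmax) * scale

block = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.Linear(16, 16))
x = torch.randn(8, 16)

with torch.no_grad():
    # true_sequential=True: follow the forward order, so the second layer is
    # calibrated on activations produced by the already-quantized first layer.
    gptq_like_update(block[0], calib_inputs=x)
    x1 = block[0](x)
    gptq_like_update(block[1], calib_inputs=x1)
    # With true_sequential=False, both layers would instead be calibrated on
    # activations captured in a single full-precision forward pass.
```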
