forked from oobabooga/textgen
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGPTQ_loader.py
More file actions
201 lines (162 loc) · 7.4 KB
/
GPTQ_loader.py
File metadata and controls
201 lines (162 loc) · 7.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import inspect
import re
import sys
from pathlib import Path
import accelerate
import torch
import transformers
from transformers import AutoConfig, AutoModelForCausalLM
import modules.shared as shared
from modules.logging_colors import logger
sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
try:
import llama_inference_offload
except ImportError:
logger.error('Failed to load GPTQ-for-LLaMa')
logger.error('See https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md')
sys.exit(-1)
try:
from modelutils import find_layers
except ImportError:
from utils import find_layers
try:
from quant import make_quant
is_triton = False
except ImportError:
import quant
is_triton = True
# This function is a replacement for the load_quant function in the
# GPTQ-for_LLaMa repository. It supports more models and branches.
def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=None, kernel_switch_threshold=128, eval=True):
exclude_layers = exclude_layers or ['lm_head']
def noop(*args, **kwargs):
pass
config = AutoConfig.from_pretrained(model, trust_remote_code=shared.args.trust_remote_code)
torch.nn.init.kaiming_uniform_ = noop
torch.nn.init.uniform_ = noop
torch.nn.init.normal_ = noop
torch.set_default_dtype(torch.half)
transformers.modeling_utils._init_weights = False
torch.set_default_dtype(torch.half)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=shared.args.trust_remote_code)
torch.set_default_dtype(torch.float)
if eval:
model = model.eval()
layers = find_layers(model)
for name in exclude_layers:
if name in layers:
del layers[name]
if not is_triton:
gptq_args = inspect.getfullargspec(make_quant).args
make_quant_kwargs = {
'module': model,
'names': layers,
'bits': wbits,
}
if 'groupsize' in gptq_args:
make_quant_kwargs['groupsize'] = groupsize
if 'faster' in gptq_args:
make_quant_kwargs['faster'] = faster_kernel
if 'kernel_switch_threshold' in gptq_args:
make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold
make_quant(**make_quant_kwargs)
else:
quant.make_quant_linear(model, layers, wbits, groupsize)
del layers
if checkpoint.endswith('.safetensors'):
from safetensors.torch import load_file as safe_load
model.load_state_dict(safe_load(checkpoint), strict=False)
else:
model.load_state_dict(torch.load(checkpoint), strict=False)
if is_triton:
if shared.args.quant_attn:
quant.make_quant_attn(model)
if eval and shared.args.fused_mlp:
quant.make_fused_mlp(model)
if shared.args.warmup_autotune:
quant.autotune_warmup_linear(model, transpose=not eval)
if eval and shared.args.fused_mlp:
quant.autotune_warmup_fused(model)
model.seqlen = 2048
return model
# Used to locate the .pt/.safetensors quantized file
def find_quantized_model_file(model_name):
if shared.args.checkpoint:
return Path(shared.args.checkpoint)
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
pt_path = None
priority_name_list = [
Path(f'{shared.args.model_dir}/{model_name}{hyphen}{shared.args.wbits}bit{group}{ext}')
for group in ([f'-{shared.args.groupsize}g', ''] if shared.args.groupsize > 0 else [''])
for ext in ['.safetensors', '.pt']
for hyphen in ['-', f'/{model_name}-', '/']
]
for path in priority_name_list:
if path.exists():
pt_path = path
break
# If the model hasn't been found with a well-behaved name, pick the last .pt
# or the last .safetensors found in its folder as a last resort
if not pt_path:
for ext in ['.pt', '.safetensors']:
found = list(path_to_model.glob(f"*{ext}"))
if len(found) > 0:
if len(found) > 1:
logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.')
pt_path = found[-1]
break
return pt_path
# The function that loads the model in modules/models.py
def load_quantized(model_name):
if shared.args.model_type is None:
logger.error("The model could not be loaded because its type could not be inferred from its name.")
logger.error("Please specify the type manually using the --model_type argument.")
return None
# Select the appropriate load_quant function
model_type = shared.args.model_type.lower()
if shared.args.pre_layer and model_type == 'llama':
load_quant = llama_inference_offload.load_quant
elif model_type in ('llama', 'opt', 'gptj'):
if shared.args.pre_layer:
logger.warning("Ignoring --pre_layer because it only works for llama model type.")
load_quant = _load_quant
else:
logger.error("Unknown pre-quantized model type specified. Only 'llama', 'opt' and 'gptj' are supported")
exit()
# Find the quantized model weights file (.pt/.safetensors)
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
pt_path = find_quantized_model_file(model_name)
if not pt_path:
logger.error("Could not find the quantized model in .pt or .safetensors format, exiting...")
exit()
else:
logger.info(f"Found the following quantized model: {pt_path}")
# qwopqwop200's offload
if model_type == 'llama' and shared.args.pre_layer:
if len(shared.args.pre_layer) == 1:
pre_layer = shared.args.pre_layer[0]
else:
pre_layer = shared.args.pre_layer
model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, pre_layer)
else:
threshold = False if model_type == 'gptj' else 128
model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold)
# accelerate offload (doesn't work properly)
if shared.args.gpu_memory or torch.cuda.device_count() > 1:
if shared.args.gpu_memory:
memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory))
max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
max_memory = {}
for i in range(len(memory_map)):
max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
else:
max_memory = accelerate.utils.get_balanced_memory(model)
device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
logger.info("Using the following device map for the quantized model:", device_map)
# https://huggingface.co/docs/accelerate/package_reference/big_modeling#accelerate.dispatch_model
model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True)
# No offload
elif not shared.args.cpu:
model = model.to(torch.device('cuda:0'))
return model