forked from oobabooga/textgen
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_generation.py
More file actions
386 lines (299 loc) · 14.1 KB
/
text_generation.py
File metadata and controls
386 lines (299 loc) · 14.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
import ast
import random
import re
import time
import traceback
import numpy as np
import torch
import transformers
import modules.shared as shared
from modules.callbacks import (Iteratorize, Stream,
_SentinelTokenStoppingCriteria)
from modules.extensions import apply_extensions
from modules.html_generator import generate_4chan_html, generate_basic_html
from modules.logging_colors import logger
from modules.models import clear_torch_cache, local_rank
def generate_reply(*args, **kwargs):
shared.generation_lock.acquire()
try:
for result in _generate_reply(*args, **kwargs):
yield result
finally:
shared.generation_lock.release()
def get_max_prompt_length(state):
return state['truncation_length'] - state['max_new_tokens']
def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
if shared.model_type in ['rwkv', 'llamacpp']:
input_ids = shared.tokenizer.encode(str(prompt))
input_ids = np.array(input_ids).reshape(1, len(input_ids))
return input_ids
else:
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
# This is a hack for making replies more creative.
if not add_bos_token and input_ids[0][0] == shared.tokenizer.bos_token_id:
input_ids = input_ids[:, 1:]
# Llama adds this extra token when the first character is '\n', and this
# compromises the stopping criteria, so we just remove it
if type(shared.tokenizer) is transformers.LlamaTokenizer and input_ids[0][0] == 29871:
input_ids = input_ids[:, 1:]
# Handling truncation
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]
if shared.model_type in ['rwkv', 'llamacpp'] or shared.args.cpu:
return input_ids
elif shared.args.flexgen:
return input_ids.numpy()
elif shared.args.deepspeed:
return input_ids.to(device=local_rank)
elif torch.has_mps:
device = torch.device('mps')
return input_ids.to(device)
else:
return input_ids.cuda()
def get_encoded_length(prompt):
length_after_extensions = apply_extensions('tokenized_length', prompt)
if length_after_extensions is not None:
return length_after_extensions
return len(encode(prompt)[0])
def decode(output_ids, skip_special_tokens=True):
return shared.tokenizer.decode(output_ids, skip_special_tokens)
# Removes empty replies from gpt4chan outputs
def fix_gpt4chan(s):
for i in range(10):
s = re.sub("--- [0-9]*\n>>[0-9]*\n---", "---", s)
s = re.sub("--- [0-9]*\n *\n---", "---", s)
s = re.sub("--- [0-9]*\n\n\n---", "---", s)
return s
# Fix the LaTeX equations in galactica
def fix_galactica(s):
s = s.replace(r'\[', r'$')
s = s.replace(r'\]', r'$')
s = s.replace(r'\(', r'$')
s = s.replace(r'\)', r'$')
s = s.replace(r'$$', r'$')
s = re.sub(r'\n', r'\n\n', s)
s = re.sub(r"\n{3,}", "\n\n", s)
return s
def get_reply_from_output_ids(output_ids, input_ids, original_question, state, is_chat=False):
if shared.model_type == 'HF_seq2seq':
reply = decode(output_ids, state['skip_special_tokens'])
else:
new_tokens = len(output_ids) - len(input_ids[0])
reply = decode(output_ids[-new_tokens:], state['skip_special_tokens'])
# Prevent LlamaTokenizer from skipping a space
if type(shared.tokenizer) is transformers.LlamaTokenizer and len(output_ids) > 0:
if shared.tokenizer.convert_ids_to_tokens(int(output_ids[-new_tokens])).startswith('▁'):
reply = ' ' + reply
if not is_chat:
reply = apply_extensions('output', reply)
return reply
def formatted_outputs(reply, model_name):
if shared.model_type == 'gpt4chan':
reply = fix_gpt4chan(reply)
return reply, generate_4chan_html(reply)
else:
return reply, generate_basic_html(reply)
def set_manual_seed(seed):
seed = int(seed)
if seed == -1:
seed = random.randint(1, 2**31)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
return seed
def stop_everything_event():
shared.stop_everything = True
def generate_reply_wrapper(question, state, eos_token=None, stopping_strings=None):
for reply in generate_reply(question, state, eos_token, stopping_strings, is_chat=False):
if shared.model_type not in ['HF_seq2seq']:
reply = question + reply
yield formatted_outputs(reply, shared.model_name)
def _generate_reply(question, state, eos_token=None, stopping_strings=None, is_chat=False):
state = apply_extensions('state', state)
generate_func = apply_extensions('custom_generate_reply')
if generate_func is None:
if shared.model_name == 'None' or shared.model is None:
logger.error("No model is loaded! Select one in the Model tab.")
yield ''
return
if shared.model_type in ['rwkv', 'llamacpp']:
generate_func = generate_reply_custom
elif shared.args.flexgen:
generate_func = generate_reply_flexgen
else:
generate_func = generate_reply_HF
# Preparing the input
original_question = question
if not is_chat:
question = apply_extensions('input', question)
if shared.args.verbose:
print(f'\n\n{question}\n--------------------\n')
shared.stop_everything = False
clear_torch_cache()
seed = set_manual_seed(state['seed'])
is_stream = state['stream']
last_update = -1
reply = ''
for reply in generate_func(question, original_question, seed, state, eos_token, stopping_strings, is_chat=is_chat):
if is_stream:
cur_time = time.time()
if cur_time - last_update > 0.041666666666666664: # Limit streaming to 24 fps
last_update = cur_time
yield reply
else:
yield reply
if is_stream:
yield reply
def generate_reply_HF(question, original_question, seed, state, eos_token=None, stopping_strings=None, is_chat=False):
generate_params = {}
for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a']:
generate_params[k] = state[k]
for k in ['epsilon_cutoff', 'eta_cutoff']:
if state[k] > 0:
generate_params[k] = state[k] * 1e-4
if state['ban_eos_token']:
generate_params['suppress_tokens'] = [shared.tokenizer.eos_token_id]
if shared.args.no_cache:
generate_params.update({'use_cache': False})
if shared.args.deepspeed:
generate_params.update({'synced_gpus': True})
# Encode the input
input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state))
output = input_ids[0]
cuda = not any((shared.args.cpu, shared.args.deepspeed))
# Find the eos tokens
eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else []
if eos_token is not None:
eos_token_ids.append(int(encode(eos_token)[0][-1]))
# Add the encoded tokens to generate_params
question, input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, input_ids, None)
original_input_ids = input_ids
generate_params.update({'inputs': input_ids})
if inputs_embeds is not None:
generate_params.update({'inputs_embeds': inputs_embeds})
# Create the StoppingCriteriaList with the stopping strings (needs to be done after tokenizer extensions)
stopping_criteria_list = transformers.StoppingCriteriaList()
for st in (stopping_strings, ast.literal_eval(f"[{state['custom_stopping_strings']}]")):
if type(st) is list and len(st) > 0:
sentinel_token_ids = [encode(string, add_special_tokens=False) for string in st]
stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=sentinel_token_ids, starting_idx=len(input_ids[0])))
break
# Update generate_params with the eos token and the stopping strings
generate_params['eos_token_id'] = eos_token_ids
generate_params['stopping_criteria'] = stopping_criteria_list
t0 = time.time()
try:
if not is_chat and shared.model_type != 'HF_seq2seq':
yield ''
# Generate the entire reply at once.
if not state['stream']:
with torch.no_grad():
output = shared.model.generate(**generate_params)[0]
if cuda:
output = output.cuda()
yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat)
# Stream the reply 1 token at a time.
# This is based on the trick of using 'stopping_criteria' to create an iterator.
else:
def generate_with_callback(callback=None, **kwargs):
kwargs['stopping_criteria'].append(Stream(callback_func=callback))
clear_torch_cache()
with torch.no_grad():
shared.model.generate(**kwargs)
def generate_with_streaming(**kwargs):
return Iteratorize(generate_with_callback, kwargs, callback=None)
with generate_with_streaming(**generate_params) as generator:
for output in generator:
yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat)
if output[-1] in eos_token_ids:
break
except Exception:
traceback.print_exc()
finally:
t1 = time.time()
original_tokens = len(original_input_ids[0])
new_tokens = len(output) - (original_tokens if shared.model_type != 'HF_seq2seq' else 0)
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
return
def generate_reply_custom(question, original_question, seed, state, eos_token=None, stopping_strings=None, is_chat=False):
seed = set_manual_seed(state['seed'])
generate_params = {'token_count': state['max_new_tokens']}
for k in ['temperature', 'top_p', 'top_k', 'repetition_penalty']:
generate_params[k] = state[k]
if shared.model_type == 'llamacpp':
for k in ['mirostat_mode', 'mirostat_tau', 'mirostat_eta']:
generate_params[k] = state[k]
t0 = time.time()
reply = ''
try:
if not is_chat:
yield ''
if not state['stream']:
reply = shared.model.generate(context=question, **generate_params)
if not is_chat:
reply = apply_extensions('output', reply)
yield reply
else:
for reply in shared.model.generate_with_streaming(context=question, **generate_params):
if not is_chat:
reply = apply_extensions('output', reply)
yield reply
except Exception:
traceback.print_exc()
finally:
t1 = time.time()
original_tokens = len(encode(original_question)[0])
new_tokens = len(encode(original_question + reply)[0]) - original_tokens
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
return
def generate_reply_flexgen(question, original_question, seed, state, eos_token=None, stopping_strings=None, is_chat=False):
generate_params = {}
for k in ['max_new_tokens', 'do_sample', 'temperature']:
generate_params[k] = state[k]
if state['stream']:
generate_params['max_new_tokens'] = 8
# Encode the input
input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state))
output = input_ids[0]
# Find the eos tokens
eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else []
if eos_token is not None:
eos_token_ids.append(int(encode(eos_token)[0][-1]))
# Add the encoded tokens to generate_params
question, input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, input_ids, None)
original_input_ids = input_ids
generate_params.update({'inputs': input_ids})
if inputs_embeds is not None:
generate_params.update({'inputs_embeds': inputs_embeds})
# Update generate_params with the eos token and the stopping strings
generate_params['stop'] = eos_token_ids[-1]
t0 = time.time()
try:
if not is_chat:
yield ''
# Generate the entire reply at once.
if not state['stream']:
with torch.no_grad():
output = shared.model.generate(**generate_params)[0]
yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat)
# Stream the output naively for FlexGen since it doesn't support 'stopping_criteria'
else:
for i in range(state['max_new_tokens'] // 8 + 1):
if shared.stop_everything:
break
clear_torch_cache()
with torch.no_grad():
output = shared.model.generate(**generate_params)[0]
if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)):
break
yield get_reply_from_output_ids(output, original_input_ids, original_question, state)
input_ids = np.reshape(output, (1, output.shape[0]))
generate_params.update({'inputs': input_ids})
except Exception:
traceback.print_exc()
finally:
t1 = time.time()
original_tokens = len(original_input_ids[0])
new_tokens = len(output) - (original_tokens if shared.model_type != 'HF_seq2seq' else 0)
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
return