def preprocess_fn(examples):
    """Render prompt/response pairs with the chat template, tokenize, and
    build ``labels`` that train only on the assistant response.

    Everything up to and including the first ``</think>`` token (chat-template
    preamble + user prompt + the empty think block the template inserts when
    ``enable_thinking=False``) is masked with -100, as are all padding
    positions, so the loss covers only the response tokens.

    Args:
        examples: batched dataset dict with string lists under ``"prompt"``
            and ``"response"``.

    Returns:
        dict with parallel lists ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"`` (one entry per example, each of length 2048).
    """
    input_ids, attention_mask, labels = [], [], []

    # Loop-invariant: resolve the separator token id once, not per example.
    sep_id = tokenizer.convert_tokens_to_ids("</think>")

    for prompt, response in zip(examples["prompt"], examples["response"]):
        messages = [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": response},
        ]
        # BUGFIX: add_generation_prompt must be False for SFT data — the
        # assistant turn is already present in `messages`; True would append
        # an extra, empty assistant prompt after the full conversation.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
            enable_thinking=False,
        )
        tok = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=2048,
        )
        ids = tok["input_ids"]
        mask = tok["attention_mask"]

        # Position just past the first "</think>"; if it is absent (e.g.
        # truncated away), fall back to 0 and train on the whole sequence.
        try:
            sep_index = ids.index(sep_id) + 1
        except ValueError:
            sep_index = 0

        # BUGFIX: besides the prompt span, also ignore padding positions
        # (attention_mask == 0) — otherwise loss is computed on pad tokens.
        lbl = [
            -100 if (pos < sep_index or m == 0) else tok_id
            for pos, (tok_id, m) in enumerate(zip(ids, mask))
        ]

        input_ids.append(ids)
        attention_mask.append(mask)
        labels.append(lbl)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


# Work on a 10k-example subset for faster iteration.
small_train = ds["train"].select(range(10000))
# Tokenize the subset in parallel and drop the raw text columns so the
# resulting dataset carries only model-ready fields.
tokenized_train = small_train.map(
    preprocess_fn,
    remove_columns=["id", "prompt", "response"],
    batched=True,
    num_proc=4,
)