initial commit

chat.py (new file, 16 lines)
@@ -0,0 +1,16 @@

import requests

from utils.prompt import prompt

messages = []
while True:
    user_prompt = prompt('>> User: ')
    messages.append({'role': 'user', 'content': user_prompt})

    response = requests.post('http://localhost:8900/', json=messages)
    if response.status_code == 200:
        messages = response.json()
        print('>> Bot : ' + messages[-1]['content'])
    else:
        messages = messages[0:-1]
        print('### Error from backend')

chat_dialogpt.py (new file, 36 lines)
@@ -0,0 +1,36 @@

import atexit
import torch
from utils.conversation import save_conversation
from utils.prompt import prompt
from transformers import AutoModelForCausalLM, AutoTokenizer


device = 'cuda'  # the device to load the model onto
model_id = 'microsoft/DialoGPT-medium'

print('Loading ' + model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map='auto')
# model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
print('Loaded')
# print(tokenizer.default_chat_template)

# read and save conversation
chat_history_ids = None
# messages = load_conversation(model_id)
atexit.register(lambda: save_conversation(model_id, bot_input_ids))

# messages.append({'role': 'system', 'content': 'Your name is "Laura". You are an AI created by Alice.'})
while True:
    user_prompt = prompt('>> User: ')
    new_user_input_ids = tokenizer.encode(user_prompt + tokenizer.eos_token, return_tensors='pt').to(device)

    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) \
        if chat_history_ids is not None \
        else new_user_input_ids
    chat_history_ids = model.generate(bot_input_ids, max_length=100, pad_token_id=tokenizer.eos_token_id).to(device)
    response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)

    print('>> Bot : ' + response)
    torch.cuda.empty_cache()

chat_gpt2.py (new file, 34 lines)
@@ -0,0 +1,34 @@

import atexit
import torch
from utils.conversation import save_conversation
from utils.prompt import prompt
from transformers import AutoModelForCausalLM, AutoTokenizer

device = 'cuda'  # the device to load the model onto
model_id = 'gpt2'

print('Loading ' + model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_id)
print('Loaded')
# print(tokenizer.default_chat_template)

# read and save conversation
chat_history_ids = None
# messages = load_conversation(model_id)
atexit.register(lambda: save_conversation(model_id, bot_input_ids))

# messages.append({'role': 'system', 'content': 'Your name is "Laura". You are an AI created by Alice.'})
while True:
    user_prompt = prompt('>> User: ')
    new_user_input_ids = tokenizer.encode(user_prompt + tokenizer.eos_token, return_tensors='pt').to(device)

    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) \
        if chat_history_ids is not None \
        else new_user_input_ids
    chat_history_ids = model.generate(bot_input_ids, max_length=100, pad_token_id=tokenizer.eos_token_id).to(device)
    response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)

    print('>> Bot : ' + response)
    torch.cuda.empty_cache()

chat_mistral.py (new file, 31 lines)
@@ -0,0 +1,31 @@

import atexit
import torch
from utils.conversation import load_conversation, save_conversation
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils.prompt import prompt

device = 'cuda'  # the device to load the model onto
model_id = 'mistralai/Mistral-7B-Instruct-v0.2'

print('Loading ' + model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_id)
print('Loaded')
# print(tokenizer.default_chat_template)

# read and save conversation
messages = load_conversation(model_id)
atexit.register(lambda: save_conversation(model_id, messages))

while True:
    user_prompt = prompt('>> User: ')
    messages.append({'role': 'user', 'content': user_prompt})

    model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
    generated_ids = model.generate(model_inputs, max_new_tokens=100, do_sample=True)
    response = tokenizer.batch_decode(generated_ids)[0]

    print('>> Bot : ' + response)
    messages.append({'role': 'assistant', 'content': response})
    torch.cuda.empty_cache()
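
Note that tokenizer.batch_decode(generated_ids)[0] above decodes the whole sequence,
prompt included, so the printed reply repeats the conversation so far. If only the newly
generated tokens are wanted, the ids can be sliced before decoding. An illustrative
sketch (variable names follow chat_mistral.py; the slicing mirrors the approach used in
chat_qwen.py below):

    new_token_ids = generated_ids[:, model_inputs.shape[-1]:]  # drop the prompt tokens
    response = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]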

chat_qwen.py (new file, 55 lines)
@@ -0,0 +1,55 @@

import atexit
import os
import sys

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils.conversation import save_conversation_json
from utils.prompt import prompt


class ChatQwen:
    default_device = 'cuda'  # the device to load the model onto
    # default_model_id = 'Qwen/Qwen1.5-0.5B-Chat'
    default_model_id = 'Qwen/Qwen1.5-1.8B-Chat'
    # default_model_id = 'Qwen/Qwen1.5-4B-Chat'

    default_instruction = {'role': 'system', 'content': 'Your name is "Laura". You are an AI created by Alice.'}

    def __init__(self, model_id_or_path=default_model_id):
        # model_id = model_id_or_path if not load_from_disk else os.path.abspath(sys.argv[1])

        print('Loading ' + model_id_or_path)
        self.model_id_or_path = model_id_or_path
        self.model = AutoModelForCausalLM.from_pretrained(model_id_or_path, torch_dtype='auto', device_map='auto')
        self.tokenizer = AutoTokenizer.from_pretrained(model_id_or_path)
        # print(tokenizer.default_chat_template)
        # print(type(model))
        # print(type(tokenizer))
        print('Loaded')

    def generate(self, messages):
        # prepare
        messages = [m for m in messages if m['role'] != 'system']
        input_messages = [self.default_instruction] + messages

        # generate
        text = self.tokenizer.apply_chat_template(input_messages, tokenize=False, add_generation_prompt=True)
        model_inputs = self.tokenizer([text], return_tensors='pt').to(self.default_device)
        generated_ids = self.model.generate(model_inputs.input_ids, max_new_tokens=100)
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # add response and save conversation
        messages.append({'role': 'assistant', 'content': response})
        self.record_conversation(input_messages, {'role': 'assistant', 'content': response})

        torch.cuda.empty_cache()  # clear cache or the gpu mem will be used a lot
        return messages

    def record_conversation(self, messages, response):
        messages = messages + [response]
        save_conversation_json(self.model_id_or_path, messages)
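
For reference, an illustrative usage sketch of the ChatQwen class above (the model id is
the class default; generate() returns the updated message list with the assistant reply
appended last):

    from chat_qwen import ChatQwen

    bot = ChatQwen()
    history = bot.generate([{'role': 'user', 'content': 'Hello!'}])
    print(history[-1]['content'])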

server.py (new file, 44 lines)
@@ -0,0 +1,44 @@

import http.server
import socketserver
import json
import sys

from chat_qwen import ChatQwen

bot = ChatQwen(sys.argv[1]) if len(sys.argv) > 1 else ChatQwen()


class Server(socketserver.TCPServer):
    # Avoid "address already used" error when frequently restarting the script
    allow_reuse_address = True


class Handler(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.end_headers()
        self.wfile.write("Use POST with JSON body of the format \n"
                         "[{\"role\": \"user\", \"content\": \"message\"}] \n"
                         "or \n"
                         "[{\"role\": \"user\", \"content\": \"message\"}, "
                         "{\"role\": \"assistant\", \"content\": \"message\"}, "
                         "{\"role\": \"user\", \"content\": \"message\"}]".encode("utf-8"))

    def do_POST(self):
        try:
            content_len = int(self.headers.get('Content-Length'))
            post_body = self.rfile.read(content_len)
            json_body = json.loads(post_body)

            response = bot.generate(json_body)

            self.send_response(200)
            self.end_headers()
            self.wfile.write(json.dumps(response).encode("utf-8"))
        except:
            self.send_response(400)
            self.end_headers()


with Server(("0.0.0.0", 8900), Handler) as httpd:
    httpd.serve_forever()
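
For reference, an illustrative client call against this server (a sketch, assuming the
server is running locally on the configured port 8900; the server replies with the full
message list, assistant reply last, which is what chat.py relies on):

    import requests

    history = [{'role': 'user', 'content': 'Hello, who are you?'}]
    reply = requests.post('http://localhost:8900/', json=history, timeout=120)
    reply.raise_for_status()
    history = reply.json()
    print(history[-1]['content'])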

train/prepare/helpsteer/helpsteer22jsonl.py (new file, 27 lines)
@@ -0,0 +1,27 @@

import json
import os

this_dir = os.path.dirname(os.path.abspath(__file__))


def mkdir(path):
    if not os.path.isdir(path):
        os.mkdir(path)


mkdir(this_dir + '/../../data')
mkdir(this_dir + '/../../data/helpsteer')

for filename in ['train.jsonl', 'validation.jsonl']:
    with open(this_dir + '/' + filename, 'r') as f:
        lines = f.readlines()

    role_dict = {'prompt': 'user', 'response': 'assistant'}
    lines = [json.loads(line) for line in lines]
    conversations = [{'messages': [{'role': 'user', 'content': line['prompt']},
                                   {'role': 'assistant', 'content': line['response']}]} for line in lines]

    print(conversations[0])

    with open(this_dir + '/../../data/helpsteer/helpsteer_' + filename[0:-6] + '_all.jsonl', 'w') as f:
        f.writelines([json.dumps(conv) + '\n' for conv in conversations])

train/prepare/oasst2/oasst22jsonl.py (new file, 42 lines)
@@ -0,0 +1,42 @@

import json
import os

# parsing OA data files with oasst_data helpers
from oasst_data import read_message_trees, ExportMessageNode

messages: list[ExportMessageNode] = []

this_dir = os.path.dirname(os.path.abspath(__file__))
input_file_path = this_dir + '/2023-11-05_oasst2_all.trees.jsonl.gz'

role_dict = {'prompter': 'user', 'assistant': 'assistant'}
conversations = []


def visit(node: ExportMessageNode, parents: [ExportMessageNode]):
    new_parents = parents + [node]
    if not node.replies:  # end of conversation
        conversations.append({'messages': [{'role': role_dict[p.role], 'content': p.text} for p in new_parents]})
    else:
        for reply in node.replies:
            visit(reply, new_parents)


for tree in read_message_trees(input_file_path):
    if tree.prompt.lang not in ['en']:  # filtering by language tag (optional)
        continue

    visit(tree.prompt, [])

print(conversations[0])


def mkdir(path):
    if not os.path.isdir(path):
        os.mkdir(path)


mkdir(this_dir + '/../../data')
mkdir(this_dir + '/../../data/oasst')
with open(this_dir + '/../../data/oasst/oasst_all.jsonl', 'w') as f:
    f.writelines([json.dumps(conv) + '\n' for conv in conversations])

train/sft/README.md (new file, 3 lines)
@@ -0,0 +1,3 @@

# Documentation

See https://github.com/QwenLM/Qwen1.5/blob/main/docs/source/training/SFT/example.rst
or [example.rst](./example.rst)

train/sft/ds_config_zero2.json (new file, 52 lines)
@@ -0,0 +1,52 @@

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

train/sft/ds_config_zero3.json (new file, 59 lines)
@@ -0,0 +1,59 @@

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "offload_param": {
            "device": "none",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

train/sft/example.rst (new file, 572 lines)
@@ -0,0 +1,572 @@

Example
====================================================

Here we provide a very simple script for supervised finetuning, which is revised from the training
script in `FastChat <https://github.com/lm-sys/FastChat>`__. The
script is used to finetune Qwen with Hugging Face Trainer. You can check
the script
`here <https://github.com/QwenLM/Qwen1.5/blob/main/finetune.py>`__. This
script for supervised finetuning (SFT) has the following features:

- Support single-GPU and multi-GPU training;
- Support full-parameter tuning,
  `LoRA <https://arxiv.org/abs/2106.09685>`__, and
  `Q-LoRA <https://arxiv.org/abs/2305.14314>`__.

In the following, we introduce more details about the usage of the
script.

Installation
------------

Before you start, make sure you have installed the following packages:

.. code:: bash

    pip install peft deepspeed optimum accelerate

Data Preparation
----------------

For data preparation, we advise you to organize the data in a jsonl
file, where each line is a dictionary as demonstrated below:

.. code:: json

    {
        "type": "chatml",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": "Tell me something about large language models."
            },
            {
                "role": "assistant",
                "content": "Large language models are a type of language model that is trained on a large corpus of text data. They are capable of generating human-like text and are used in a variety of natural language processing tasks..."
            }
        ],
        "source": "unknown"
    }

.. code:: json

    {
        "type": "chatml",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": "What is your name?"
            },
            {
                "role": "assistant",
                "content": "My name is Qwen."
            }
        ],
        "source": "self-made"
    }

Above are two examples of each data sample in the dataset. Each sample
is a JSON object with the following fields: ``type``, ``messages`` and
``source``. ``messages`` is required while the others are optional for
you to label your data format and data source. The ``messages`` field is
a list of JSON objects, each of which has two fields: ``role`` and
``content``. ``role`` can be ``system``, ``user``, or ``assistant``.
``content`` is the text of the message. ``source`` is the source of the
data, which can be ``self-made``, ``alpaca``, ``open-hermes``, or any
other string.

To make the jsonl file, you can use ``json`` to save a list of
dictionaries to the jsonl file:

.. code:: python

    import json

    with open('data.jsonl', 'w') as f:
        for sample in samples:
            f.write(json.dumps(sample) + '\n')
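
Conversely, a quick sanity check that every line of the resulting file parses
and carries the required ``messages`` field can be written as follows (an
illustrative sketch, not part of the training script):

.. code:: python

    import json

    with open('data.jsonl') as f:
        for lineno, line in enumerate(f, 1):
            sample = json.loads(line)
            assert 'messages' in sample, f'line {lineno}: missing "messages"'
            assert all('role' in m and 'content' in m for m in sample['messages'])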

Quickstart
----------

For you to start finetuning quickly, we directly provide a shell script
for you to run without paying attention to details. You need
different hyperparameters for different types of training, e.g.,
single-GPU / multi-GPU training, full-parameter tuning, LoRA, or Q-LoRA.

.. code:: bash

    cd examples/sft
    bash finetune.sh -m <model_path> -d <data_path> --deepspeed <config_path> [--use_lora True] [--q_lora True]

Specify the ``<model_path>`` for your model, ``<data_path>`` for your
data, and ``<config_path>`` for your deepspeed configuration.
If you use LoRA or Q-LoRA, just add ``--use_lora True`` or
``--q_lora True`` based on your requirements.
This is the simplest way to start finetuning. If you want to change more
hyperparameters, you can dive into the script and modify those
parameters.

Advanced Usages
---------------

In this section, we introduce the details of the scripts, including the
core python script as well as the corresponding shell script.

Shell Script
~~~~~~~~~~~~

Before we introduce the python code, we provide a brief introduction to
the shell script with commands. We provide some guidance inside the
shell script and here we take ``finetune.sh`` as an example.

To set up the environment variables for distributed training (or
single-GPU training), specify the following variables:
``GPUS_PER_NODE``, ``NNODES``, ``NODE_RANK``, ``MASTER_ADDR``, and
``MASTER_PORT``. No need to worry too much about them as we provide the
default settings for you. In the command, you can pass in the arguments
``-m`` and ``-d`` to specify the model path and data path, respectively.
You can also pass in the argument ``--deepspeed`` to specify the
deepspeed configuration file. We provide two configuration files for
ZeRO2 and ZeRO3, and you can choose one based on your requirements. In
most cases, we recommend using ZeRO3 for multi-GPU training except for
Q-LoRA, where we recommend using ZeRO2.

There are a series of hyperparameters to tune. Pass in ``--bf16`` or
``--fp16`` to specify the precision for mixed precision training.
The other significant hyperparameters include:

- ``--output_dir``: the path of your output models or adapters.
- ``--num_train_epochs``: the number of training epochs.
- ``--gradient_accumulation_steps``: the number of gradient
  accumulation steps.
- ``--per_device_train_batch_size``: the batch size per GPU for
  training, and the total batch size is equal to
  ``per_device_train_batch_size`` :math:`\times` ``number_of_gpus``
  :math:`\times` ``gradient_accumulation_steps``.
- ``--learning_rate``: the learning rate.
- ``--warmup_steps``: the number of warmup steps.
- ``--lr_scheduler_type``: the type of learning rate scheduler.
- ``--weight_decay``: the value of weight decay.
- ``--adam_beta2``: the value of :math:`\beta_2` in Adam.
- ``--model_max_length``: the maximum sequence length.
- ``--use_lora``: whether to use LoRA. Adding ``--q_lora`` can enable
  Q-LoRA.
- ``--gradient_checkpointing``: whether to use gradient checkpointing.

Python Script
~~~~~~~~~~~~~

In this script, we mainly use ``trainer`` from HF and ``peft`` to train
our models. We also use ``deepspeed`` to accelerate the training
process. The script is very simple and easy to understand.

.. code:: python

    @dataclass
    class ModelArguments:
        model_name_or_path: Optional[str] = field(default="Qwen/Qwen-7B")


    @dataclass
    class DataArguments:
        data_path: str = field(
            default=None, metadata={"help": "Path to the training data."}
        )
        eval_data_path: str = field(
            default=None, metadata={"help": "Path to the evaluation data."}
        )
        lazy_preprocess: bool = False


    @dataclass
    class TrainingArguments(transformers.TrainingArguments):
        cache_dir: Optional[str] = field(default=None)
        optim: str = field(default="adamw_torch")
        model_max_length: int = field(
            default=8192,
            metadata={
                "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
            },
        )
        use_lora: bool = False


    @dataclass
    class LoraArguments:
        lora_r: int = 64
        lora_alpha: int = 16
        lora_dropout: float = 0.05
        lora_target_modules: List[str] = field(
            default_factory=lambda: [
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "up_proj",
                "gate_proj",
                "down_proj",
            ]
        )
        lora_weight_path: str = ""
        lora_bias: str = "none"
        q_lora: bool = False

The classes for arguments allow you to specify hyperparameters for
model, data, training, and additionally LoRA if you use LoRA or Q-LoRA
to train your model. Specifically, ``model-max-length`` is a key
hyperparameter that determines your maximum sequence length of your
training data.

``LoraArguments`` includes the hyperparameters for LoRA or Q-LoRA:

- ``lora_r``: the rank for LoRA;
- ``lora_alpha``: the alpha value for LoRA;
- ``lora_dropout``: the dropout rate for LoRA;
- ``lora_target_modules``: the target modules for LoRA. By default we
  tune all linear layers;
- ``lora_weight_path``: the path to the weight file for LoRA;
- ``lora_bias``: the bias for LoRA;
- ``q_lora``: whether to use Q-LoRA.


.. code:: python

    def maybe_zero_3(param):
        if hasattr(param, "ds_id"):
            assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE
            with zero.GatheredParameters([param]):
                param = param.data.detach().cpu().clone()
        else:
            param = param.detach().cpu().clone()
        return param


    # Borrowed from peft.utils.get_peft_model_state_dict
    def get_peft_state_maybe_zero_3(named_params, bias):
        if bias == "none":
            to_return = {k: t for k, t in named_params if "lora_" in k}
        elif bias == "all":
            to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
        elif bias == "lora_only":
            to_return = {}
            maybe_lora_bias = {}
            lora_bias_names = set()
            for k, t in named_params:
                if "lora_" in k:
                    to_return[k] = t
                    bias_name = k.split("lora_")[0] + "bias"
                    lora_bias_names.add(bias_name)
                elif "bias" in k:
                    maybe_lora_bias[k] = t
            for k, t in maybe_lora_bias:
                if bias_name in lora_bias_names:
                    to_return[bias_name] = t
        else:
            raise NotImplementedError
        to_return = {k: maybe_zero_3(v) for k, v in to_return.items()}
        return to_return


    def safe_save_model_for_hf_trainer(
        trainer: transformers.Trainer, output_dir: str, bias="none"
    ):
        """Collects the state dict and dump to disk."""
        # check if zero3 mode enabled
        if deepspeed.is_deepspeed_zero3_enabled():
            state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict()
        else:
            if trainer.args.use_lora:
                state_dict = get_peft_state_maybe_zero_3(
                    trainer.model.named_parameters(), bias
                )
            else:
                state_dict = trainer.model.state_dict()
        if trainer.args.should_save and trainer.args.local_rank == 0:
            trainer._save(output_dir, state_dict=state_dict)

The method ``safe_save_model_for_hf_trainer``, which uses
``get_peft_state_maybe_zero_3``, helps tackle the problems in saving
models trained either with or without ZeRO3.

.. code:: python

    def preprocess(
        messages,
        tokenizer: transformers.PreTrainedTokenizer,
        max_len: int,
    ) -> Dict:
        """Preprocesses the data for supervised fine-tuning."""

        texts = []
        for i, msg in enumerate(messages):
            texts.append(
                tokenizer.apply_chat_template(
                    msg,
                    tokenize=True,
                    add_generation_prompt=False,
                    padding=True,
                    max_length=max_len,
                    truncation=True,
                )
            )
        input_ids = torch.tensor(texts, dtype=torch.int)
        target_ids = input_ids.clone()
        target_ids[target_ids == tokenizer.pad_token_id] = IGNORE_TOKEN_ID
        attention_mask = input_ids.ne(tokenizer.pad_token_id)

        return dict(
            input_ids=input_ids, target_ids=target_ids, attention_mask=attention_mask
        )

For data preprocessing, we use ``preprocess`` to organize the data.
Specifically, we apply our ChatML template to the texts. If you prefer
other chat templates, you can use others, e.g., by still applying
``apply_chat_template()`` with another tokenizer. The chat template is
stored in the ``tokenizer_config.json`` in the HF repo. Additionally, we
pad the sequence of each sample to the maximum length for training.
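
For intuition, the ChatML template defined in ``finetune.py`` renders one sample
roughly as shown below (an illustrative sketch with ``tokenize=False`` for
readability; the exact string depends on the tokenizer and template in use):

.. code:: python

    messages = [
        {"role": "user", "content": "What is your name?"},
        {"role": "assistant", "content": "My name is Qwen."},
    ]
    text = tokenizer.apply_chat_template(messages, chat_template=TEMPLATE, tokenize=False)
    # text is expected to look like:
    # <|im_start|>system
    # You are a helpful assistant.<|im_end|>
    # <|im_start|>user
    # What is your name?<|im_end|>
    # <|im_start|>assistant
    # My name is Qwen.<|im_end|>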

.. code:: python

    class SupervisedDataset(Dataset):
        """Dataset for supervised fine-tuning."""

        def __init__(
            self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
        ):
            super(SupervisedDataset, self).__init__()

            rank0_print("Formatting inputs...")
            messages = [example["messages"] for example in raw_data]
            data_dict = preprocess(messages, tokenizer, max_len)

            self.input_ids = data_dict["input_ids"]
            self.target_ids = data_dict["target_ids"]
            self.attention_mask = data_dict["attention_mask"]

        def __len__(self):
            return len(self.input_ids)

        def __getitem__(self, i) -> Dict[str, torch.Tensor]:
            return dict(
                input_ids=self.input_ids[i],
                labels=self.target_ids[i],
                attention_mask=self.attention_mask[i],
            )


    class LazySupervisedDataset(Dataset):
        """Dataset for supervised fine-tuning."""

        def __init__(
            self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
        ):
            super(LazySupervisedDataset, self).__init__()
            self.tokenizer = tokenizer
            self.max_len = max_len

            rank0_print("Formatting inputs...Skip in lazy mode")
            self.tokenizer = tokenizer
            self.raw_data = raw_data
            self.cached_data_dict = {}

        def __len__(self):
            return len(self.raw_data)

        def __getitem__(self, i) -> Dict[str, torch.Tensor]:
            if i in self.cached_data_dict:
                return self.cached_data_dict[i]

            ret = preprocess([self.raw_data[i]["messages"]], self.tokenizer, self.max_len)
            ret = dict(
                input_ids=ret["input_ids"][0],
                labels=ret["target_ids"][0],
                attention_mask=ret["attention_mask"][0],
            )
            self.cached_data_dict[i] = ret

            return ret


    def make_supervised_data_module(
        tokenizer: transformers.PreTrainedTokenizer,
        data_args,
        max_len,
    ) -> Dict:
        """Make dataset and collator for supervised fine-tuning."""
        dataset_cls = (
            LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset
        )
        rank0_print("Loading data...")

        train_data = []
        with open(data_args.data_path, "r") as f:
            for line in f:
                train_data.append(json.loads(line))
        train_dataset = dataset_cls(train_data, tokenizer=tokenizer, max_len=max_len)

        if data_args.eval_data_path:
            eval_data = []
            with open(data_args.eval_data_path, "r") as f:
                for line in f:
                    eval_data.append(json.loads(line))
            eval_dataset = dataset_cls(eval_data, tokenizer=tokenizer, max_len=max_len)
        else:
            eval_dataset = None

        return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)

Then we utilize ``make_supervised_data_module`` by using
``SupervisedDataset`` or ``LazySupervisedDataset`` to build the dataset.

.. code:: python

    def train():
        global local_rank

        parser = transformers.HfArgumentParser(
            (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
        )
        (
            model_args,
            data_args,
            training_args,
            lora_args,
        ) = parser.parse_args_into_dataclasses()

        # This serves for single-gpu qlora.
        if (
            getattr(training_args, "deepspeed", None)
            and int(os.environ.get("WORLD_SIZE", 1)) == 1
        ):
            training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED

        local_rank = training_args.local_rank

        device_map = None
        world_size = int(os.environ.get("WORLD_SIZE", 1))
        ddp = world_size != 1
        if lora_args.q_lora:
            device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else "auto"
            if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
                logging.warning("FSDP or ZeRO3 is incompatible with QLoRA.")

        model_load_kwargs = {
            "low_cpu_mem_usage": not deepspeed.is_deepspeed_zero3_enabled(),
        }

        compute_dtype = (
            torch.float16
            if training_args.fp16
            else (torch.bfloat16 if training_args.bf16 else torch.float32)
        )

        # Load model and tokenizer
        config = transformers.AutoConfig.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=training_args.cache_dir,
        )
        config.use_cache = False

        model = AutoModelForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            config=config,
            cache_dir=training_args.cache_dir,
            device_map=device_map,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=compute_dtype,
            )
            if training_args.use_lora and lora_args.q_lora
            else None,
            **model_load_kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=training_args.cache_dir,
            model_max_length=training_args.model_max_length,
            padding_side="right",
            use_fast=False,
        )

        if training_args.use_lora:
            lora_config = LoraConfig(
                r=lora_args.lora_r,
                lora_alpha=lora_args.lora_alpha,
                target_modules=lora_args.lora_target_modules,
                lora_dropout=lora_args.lora_dropout,
                bias=lora_args.lora_bias,
                task_type="CAUSAL_LM",
            )
            if lora_args.q_lora:
                model = prepare_model_for_kbit_training(
                    model, use_gradient_checkpointing=training_args.gradient_checkpointing
                )

            model = get_peft_model(model, lora_config)

            # Print peft trainable params
            model.print_trainable_parameters()

            if training_args.gradient_checkpointing:
                model.enable_input_require_grads()

        # Load data
        data_module = make_supervised_data_module(
            tokenizer=tokenizer, data_args=data_args, max_len=training_args.model_max_length
        )

        # Start trainer
        trainer = Trainer(
            model=model, tokenizer=tokenizer, args=training_args, **data_module
        )

        # `not training_args.use_lora` is a temporary workaround for the issue that there are problems with
        # loading the checkpoint when using LoRA with DeepSpeed.
        # Check this issue https://github.com/huggingface/peft/issues/746 for more information.
        if (
            list(pathlib.Path(training_args.output_dir).glob("checkpoint-*"))
            and not training_args.use_lora
        ):
            trainer.train(resume_from_checkpoint=True)
        else:
            trainer.train()
        trainer.save_state()

        safe_save_model_for_hf_trainer(
            trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias
        )

The ``train`` method is the key to the training. In general, it loads
the tokenizer and model with ``AutoTokenizer.from_pretrained()`` and
``AutoModelForCausalLM.from_pretrained()``. If we use LoRA, the method
will initialize LoRA configuration with ``LoraConfig``. If we apply
Q-LoRA, we should use ``prepare_model_for_kbit_training``. Note that for
now it still does not support resume for LoRA. Then we leave the
following efforts to ``trainer`` and have a cup of coffee!

Next Step
---------

Now, you are able to use a very simple script to perform different types
of SFT. Alternatively, you can use more advanced training libraries,
such as
`Axolotl <https://github.com/OpenAccess-AI-Collective/axolotl>`__ or
`LLaMA-Factory <https://github.com/hiyouga/LLaMA-Factory>`__, to enjoy
more functionalities. To take a step forward, after SFT, you can
consider RLHF to align your model to human preferences! Stay tuned for
our next tutorial on RLHF!

train/sft/finetune.py (new file, 378 lines)
@@ -0,0 +1,378 @@

# This code is based on the revised code from fastchat based on tatsu-lab/stanford_alpaca.


from dataclasses import dataclass, field
import json
import logging
import os
import pathlib
from typing import Dict, Optional, List
import torch
from torch.utils.data import Dataset
from deepspeed import zero
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import Trainer, BitsAndBytesConfig, deepspeed
from transformers.trainer_pt_utils import LabelSmoother
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate.utils import DistributedType


IGNORE_TOKEN_ID = LabelSmoother.ignore_index

TEMPLATE = "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if loop.last %}{{ '<|im_end|>'}}{% else %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}"

local_rank = None


def rank0_print(*args):
    if local_rank == 0:
        print(*args)


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="Qwen/Qwen-7B")


@dataclass
class DataArguments:
    data_path: str = field(
        default=None, metadata={"help": "Path to the training data."}
    )
    eval_data_path: str = field(
        default=None, metadata={"help": "Path to the evaluation data."}
    )
    lazy_preprocess: bool = False


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=8192,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    use_lora: bool = False


@dataclass
class LoraArguments:
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_target_modules: List[str] = field(
        default_factory=lambda: [
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "up_proj",
            "gate_proj",
            "down_proj",
        ]
    )
    lora_weight_path: str = ""
    lora_bias: str = "none"
    q_lora: bool = False


def maybe_zero_3(param):
    if hasattr(param, "ds_id"):
        assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE
        with zero.GatheredParameters([param]):
            param = param.data.detach().cpu().clone()
    else:
        param = param.detach().cpu().clone()
    return param


# Borrowed from peft.utils.get_peft_model_state_dict
def get_peft_state_maybe_zero_3(named_params, bias):
    if bias == "none":
        to_return = {k: t for k, t in named_params if "lora_" in k}
    elif bias == "all":
        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
    elif bias == "lora_only":
        to_return = {}
        maybe_lora_bias = {}
        lora_bias_names = set()
        for k, t in named_params:
            if "lora_" in k:
                to_return[k] = t
                bias_name = k.split("lora_")[0] + "bias"
                lora_bias_names.add(bias_name)
            elif "bias" in k:
                maybe_lora_bias[k] = t
        for k, t in maybe_lora_bias:
            if bias_name in lora_bias_names:
                to_return[bias_name] = t
    else:
        raise NotImplementedError
    to_return = {k: maybe_zero_3(v) for k, v in to_return.items()}
    return to_return


def safe_save_model_for_hf_trainer(
    trainer: transformers.Trainer, output_dir: str, bias="none"
):
    """Collects the state dict and dump to disk."""
    # check if zero3 mode enabled
    if deepspeed.is_deepspeed_zero3_enabled():
        state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict()
    else:
        if trainer.args.use_lora:
            state_dict = get_peft_state_maybe_zero_3(
                trainer.model.named_parameters(), bias
            )
        else:
            state_dict = trainer.model.state_dict()
    if trainer.args.should_save and trainer.args.local_rank == 0:
        trainer._save(output_dir, state_dict=state_dict)


def preprocess(
    messages,
    tokenizer: transformers.PreTrainedTokenizer,
    max_len: int,
) -> Dict:
    """Preprocesses the data for supervised fine-tuning."""

    texts = []
    for i, msg in enumerate(messages):
        texts.append(
            tokenizer.apply_chat_template(
                msg,
                chat_template=TEMPLATE,
                tokenize=True,
                add_generation_prompt=False,
                padding=True,
                max_length=max_len,
                truncation=True,
            )
        )
    input_ids = torch.tensor(texts, dtype=torch.int)
    target_ids = input_ids.clone()
    target_ids[target_ids == tokenizer.pad_token_id] = IGNORE_TOKEN_ID
    attention_mask = input_ids.ne(tokenizer.pad_token_id)

    return dict(
        input_ids=input_ids, target_ids=target_ids, attention_mask=attention_mask
    )


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(
        self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
    ):
        super(SupervisedDataset, self).__init__()

        rank0_print("Formatting inputs...")
        messages = [example["messages"] for example in raw_data]
        data_dict = preprocess(messages, tokenizer, max_len)

        self.input_ids = data_dict["input_ids"]
        self.target_ids = data_dict["target_ids"]
        self.attention_mask = data_dict["attention_mask"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(
            input_ids=self.input_ids[i],
            labels=self.target_ids[i],
            attention_mask=self.attention_mask[i],
        )


class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(
        self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
    ):
        super(LazySupervisedDataset, self).__init__()
        self.tokenizer = tokenizer
        self.max_len = max_len

        rank0_print("Formatting inputs...Skip in lazy mode")
        self.tokenizer = tokenizer
        self.raw_data = raw_data
        self.cached_data_dict = {}

    def __len__(self):
        return len(self.raw_data)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        if i in self.cached_data_dict:
            return self.cached_data_dict[i]

        ret = preprocess([self.raw_data[i]["messages"]], self.tokenizer, self.max_len)
        ret = dict(
            input_ids=ret["input_ids"][0],
            labels=ret["target_ids"][0],
            attention_mask=ret["attention_mask"][0],
        )
        self.cached_data_dict[i] = ret

        return ret


def make_supervised_data_module(
    tokenizer: transformers.PreTrainedTokenizer,
    data_args,
    max_len,
) -> Dict:
    """Make dataset and collator for supervised fine-tuning."""
    dataset_cls = (
        LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset
    )
    rank0_print("Loading data...")

    train_data = []
    with open(data_args.data_path, "r") as f:
        for line in f:
            train_data.append(json.loads(line))
    train_dataset = dataset_cls(train_data, tokenizer=tokenizer, max_len=max_len)

    if data_args.eval_data_path:
        eval_data = []
        with open(data_args.eval_data_path, "r") as f:
            for line in f:
                eval_data.append(json.loads(line))
        eval_dataset = dataset_cls(eval_data, tokenizer=tokenizer, max_len=max_len)
    else:
        eval_dataset = None

    return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)


def train():
    global local_rank

    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
    )
    (
        model_args,
        data_args,
        training_args,
        lora_args,
    ) = parser.parse_args_into_dataclasses()

    # This serves for single-gpu qlora.
    if (
        getattr(training_args, "deepspeed", None)
        and int(os.environ.get("WORLD_SIZE", 1)) == 1
    ):
        training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED

    local_rank = training_args.local_rank

    device_map = None
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    if lora_args.q_lora:
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else "auto"
        if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
            logging.warning("FSDP or ZeRO3 is incompatible with QLoRA.")

    model_load_kwargs = {
        "low_cpu_mem_usage": not deepspeed.is_deepspeed_zero3_enabled(),
    }

    compute_dtype = (
        torch.float16
        if training_args.fp16
        else (torch.bfloat16 if training_args.bf16 else torch.float32)
    )

    # Load model and tokenizer
    config = transformers.AutoConfig.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
    )
    config.use_cache = False

    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=training_args.cache_dir,
        device_map=device_map,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
        )
        if training_args.use_lora and lora_args.q_lora
        else None,
        **model_load_kwargs,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=False,
    )

    if training_args.use_lora:
        lora_config = LoraConfig(
            r=lora_args.lora_r,
            lora_alpha=lora_args.lora_alpha,
            target_modules=lora_args.lora_target_modules,
            lora_dropout=lora_args.lora_dropout,
            bias=lora_args.lora_bias,
            task_type="CAUSAL_LM",
        )
        if lora_args.q_lora:
            model = prepare_model_for_kbit_training(
                model, use_gradient_checkpointing=training_args.gradient_checkpointing
            )

        model = get_peft_model(model, lora_config)

        # Print peft trainable params
        model.print_trainable_parameters()

        if training_args.gradient_checkpointing:
            model.enable_input_require_grads()

    # Load data
    data_module = make_supervised_data_module(
        tokenizer=tokenizer, data_args=data_args, max_len=training_args.model_max_length
    )

    # Start trainer
    trainer = Trainer(
        model=model, tokenizer=tokenizer, args=training_args, **data_module
    )

    # `not training_args.use_lora` is a temporary workaround for the issue that there are problems with
    # loading the checkpoint when using LoRA with DeepSpeed.
    # Check this issue https://github.com/huggingface/peft/issues/746 for more information.
    if (
        list(pathlib.Path(training_args.output_dir).glob("checkpoint-*"))
        and not training_args.use_lora
    ):
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()
    trainer.save_state()

    safe_save_model_for_hf_trainer(
        trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias
    )


if __name__ == "__main__":
    train()
107
train/sft/finetune.sh
Executable file
107
train/sft/finetune.sh
Executable file
@@ -0,0 +1,107 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
export CUDA_DEVICE_MAX_CONNECTIONS=1
|
||||||
|
DIR=`pwd`
|
||||||
|
|
||||||
|
# Guide:
|
||||||
|
# This script supports distributed training on multi-gpu workers (as well as single-worker training).
|
||||||
|
# Please set the options below according to the comments.
|
||||||
|
# For multi-gpu workers training, these options should be manually set for each worker.
|
||||||
|
# After setting the options, please run the script on each worker.
|
||||||
|
|
||||||
|
# Number of GPUs per GPU worker
|
||||||
|
GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')
|
||||||
|
|
||||||
|
# Number of GPU workers, for single-worker training, please set to 1
|
||||||
|
NNODES=${NNODES:-1}
|
||||||
|
|
||||||
|
# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
|
||||||
|
NODE_RANK=${NODE_RANK:-0}
|
||||||
|
|
||||||
|
# The ip address of the rank-0 worker, for single-worker training, please set to localhost
|
||||||
|
MASTER_ADDR=${MASTER_ADDR:-localhost}
|
||||||
|
|
||||||
|
# The port for communication
|
||||||
|
MASTER_PORT=${MASTER_PORT:-6001}
|
||||||
|
|
||||||
|
MODEL="Qwen/Qwen1.5-7B" # Set the path if you do not want to load from huggingface directly
|
||||||
|
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
|
||||||
|
# See the section for finetuning in README for more information.
|
||||||
|
DATA="path_to_data"
|
||||||
|
DS_CONFIG_PATH="ds_config_zero3.json"
|
||||||
|
USE_LORA=False
|
||||||
|
Q_LORA=False
|
||||||
|
|
||||||
|
function usage() {
|
||||||
|
echo '
|
||||||
|
Usage: bash finetune/finetune_lora_ds.sh [-m MODEL_PATH] [-d DATA_PATH] [--deepspeed DS_CONFIG_PATH] [--use_lora USE_LORA] [--q_lora Q_LORA]
|
||||||
|
'
|
||||||
|
}
|
||||||
|
|
||||||
|
while [[ "$1" != "" ]]; do
|
||||||
|
case $1 in
|
||||||
|
-m | --model )
|
||||||
|
shift
|
||||||
|
MODEL=$1
|
||||||
|
;;
|
||||||
|
-d | --data )
|
||||||
|
shift
|
||||||
|
DATA=$1
|
||||||
|
;;
|
||||||
|
--deepspeed )
|
||||||
|
shift
|
||||||
|
DS_CONFIG_PATH=$1
|
||||||
|
;;
|
||||||
|
--use_lora )
|
||||||
|
shift
|
||||||
|
USE_LORA=$1
|
||||||
|
;;
|
||||||
|
--q_lora )
|
||||||
|
shift
|
||||||
|
Q_LORA=$1
|
||||||
|
;;
|
||||||
|
-h | --help )
|
||||||
|
usage
|
||||||
|
exit 0
|
||||||
|
;;
|
||||||
|
* )
|
||||||
|
echo "Unknown argument ${1}"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
shift
|
||||||
|
done
|
||||||
|
|
||||||
|
DISTRIBUTED_ARGS="
|
||||||
|
--nproc_per_node $GPUS_PER_NODE \
|
||||||
|
--nnodes $NNODES \
|
||||||
|
--node_rank $NODE_RANK \
|
||||||
|
--master_addr $MASTER_ADDR \
|
||||||
|
--master_port $MASTER_PORT
|
||||||
|
"
|
||||||
|
|
||||||
|
torchrun $DISTRIBUTED_ARGS finetune.py \
|
||||||
|
--model_name_or_path $MODEL \
|
||||||
|
--data_path $DATA \
|
||||||
|
--bf16 False \
|
||||||
|
--output_dir output_qwen \
|
||||||
|
--num_train_epochs 5 \
|
||||||
|
--per_device_train_batch_size 2 \
|
||||||
|
--per_device_eval_batch_size 1 \
|
||||||
|
--gradient_accumulation_steps 8 \
|
||||||
|
--evaluation_strategy "no" \
|
||||||
|
--save_strategy "steps" \
|
||||||
|
--save_steps 10 \
|
||||||
|
--save_total_limit 10 \
|
||||||
|
--learning_rate 3e-4 \
|
||||||
|
--weight_decay 0.01 \
|
||||||
|
--adam_beta2 0.95 \
|
||||||
|
--warmup_ratio 0.01 \
|
||||||
|
--lr_scheduler_type "cosine" \
|
||||||
|
--logging_steps 1 \
|
||||||
|
--report_to "none" \
|
||||||
|
--model_max_length 512 \
|
||||||
|
--lazy_preprocess True \
|
||||||
|
--use_lora ${USE_LORA} \
|
||||||
|
--q_lora ${Q_LORA} \
|
||||||
|
--gradient_checkpointing \
|
||||||
|
--deepspeed ${DS_CONFIG_PATH}
|
||||||
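
# Example invocations (hypothetical paths, not part of the original script):
#   single worker, full finetuning:
#     bash train/sft/finetune.sh -m Qwen/Qwen1.5-0.5B-Chat -d data/train.json
#   single worker, LoRA finetuning:
#     bash train/sft/finetune.sh -m Qwen/Qwen1.5-0.5B-Chat -d data/train.json --use_lora True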
0
utils/__init__.py
Normal file
61
utils/conversation.py
Normal file
@@ -0,0 +1,61 @@
import datetime
import json
import os
import pickle


def load_conversation(model_id):
    folder = 'conversations/' + model_id.replace('/', '_')
    # `folder` is relative to the working directory, so create 'conversations' there as well
    mkdir('conversations')
    mkdir(folder)

    files = os.listdir(folder)
    files = [file for file in files if file.endswith(".pickle") and os.path.isfile(folder + '/' + file)]
    files.sort(reverse=True)
    if len(files) > 0:
        pickle_filename = folder + '/' + files[0]
        print('Loading last conversation from ' + pickle_filename)
        with open(pickle_filename, 'rb') as file:
            return pickle.load(file)
    return []


def save_conversation(model_id, messages):
    folder = 'conversations/' + model_id.replace('/', '_')
    mkdir('conversations')
    mkdir(folder)
    timestamp = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
    pickle_filename = folder + '/' + timestamp + '.pickle'
    with open(pickle_filename, 'wb') as file:
        pickle.dump(messages, file)


def load_conversation_json(model_id):
    folder = 'conversations/' + model_id.replace('/', '_')
    mkdir('conversations')
    mkdir(folder)

    files = os.listdir(folder)
    files = [file for file in files if file.endswith(".json") and os.path.isfile(folder + '/' + file)]
    files.sort(reverse=True)
    if len(files) > 0:
        json_filename = folder + '/' + files[0]
        print('Loading last conversation from ' + json_filename)
        with open(json_filename, 'r') as file:
            return json.load(file)
    return []


def save_conversation_json(model_id, messages):
    folder = 'conversations/' + model_id.replace('/', '_')
    mkdir('conversations')
    mkdir(folder)
    timestamp = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
    json_filename = folder + '/' + timestamp + '.json'
    with open(json_filename, 'w') as file:
        json.dump(messages, file)


def mkdir(path):
    if not os.path.isdir(path):
        os.mkdir(path)
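
A minimal usage sketch (hypothetical model id; assumes the code is run from the repository root so the relative conversations/ folder is shared with the chat scripts):

    from utils.conversation import load_conversation_json, save_conversation_json

    messages = [{'role': 'user', 'content': 'Hello'}]
    save_conversation_json('Qwen/Qwen1.5-0.5B-Chat', messages)
    restored = load_conversation_json('Qwen/Qwen1.5-0.5B-Chat')  # picks the newest timestamped file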
12
utils/download_dataset.py
Normal file
@@ -0,0 +1,12 @@
import pickle
from conversation import mkdir
from datasets import load_dataset

dataset_id = 'OpenAssistant/oasst2'

# This script is meant to be run from inside utils/ (it imports `conversation` directly),
# so the dataset is stored one level up, in ../datasets
mkdir('../datasets')
pickle_filename = '../datasets/' + dataset_id.replace('/', '_') + '.pickle'
dataset = load_dataset(dataset_id)
with open(pickle_filename, 'wb') as file:
    pickle.dump(dataset, file)
print('Saved as pickle to ' + pickle_filename)
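
A minimal sketch of reading the saved dataset back (the path follows the naming scheme above and assumes the same working directory):

    import pickle

    with open('../datasets/OpenAssistant_oasst2.pickle', 'rb') as file:
        dataset = pickle.load(file)
    print(dataset)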
18
utils/download_model.py
Normal file
@@ -0,0 +1,18 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer

from conversation import mkdir

model_id = 'Qwen/Qwen1.5-0.5B-Chat'
# model_id = 'Qwen/Qwen1.5-1.8B-Chat'

print('Downloading ' + model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_id)
print('Downloaded')

# A bare Trainer is only used here so that save_model() writes the model and tokenizer
# to a local directory in one call
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
)
mkdir('models')
trainer.save_model('./models/' + model_id.replace('/', '_'))
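
A minimal sketch of loading the locally saved copy later (the path follows the naming scheme above):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    local_path = './models/Qwen_Qwen1.5-0.5B-Chat'
    model = AutoModelForCausalLM.from_pretrained(local_path, torch_dtype='auto', device_map='auto')
    tokenizer = AutoTokenizer.from_pretrained(local_path)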
3
utils/fix_cuda.sh
Executable file
@@ -0,0 +1,3 @@
# https://medium.com/@Spritan/dealing-with-cuda-initialization-error-aa7c88d021e4
sudo rmmod nvidia_uvm
sudo modprobe nvidia_uvm
22
utils/pickle2json.py
Normal file
@@ -0,0 +1,22 @@
import json
import pickle
import sys

files = sys.argv[1:]
print(files)

for pickle_filename in files:
    if not pickle_filename.endswith('.pickle'):
        print(pickle_filename + ' is not a pickle file, ignoring')
        continue

    with open(pickle_filename, 'rb') as file:
        obj = pickle.load(file)
    print(obj)

    # replace the trailing 'pickle' extension with 'json'
    json_filename = pickle_filename[0:-6] + 'json'
    try:
        with open(json_filename, 'w') as file:
            json.dump(obj, file)
    except Exception as e:
        print(e)
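
Example invocation (hypothetical path, assuming conversations were saved by utils/conversation.py and the script is run from the repository root):

    python utils/pickle2json.py conversations/microsoft_DialoGPT-medium/20240101120000.pickle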
14
utils/prompt.py
Normal file
@@ -0,0 +1,14 @@
def prompt(prompt):
    while True:
        try:
            return input(prompt)
        except EOFError:
            print()
            exit(0)
        except KeyboardInterrupt:
            print()
            exit(0)
        # in case: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 11: invalid continuation byte
        except UnicodeDecodeError as e:
            print(e)
            print('prompt ignored')
25
utils/split_shuffle_dataset.py
Normal file
@@ -0,0 +1,25 @@
import json
import os
import random
import sys

original = sys.argv[1]
no_dataset = int(sys.argv[2])

if not original.endswith('.jsonl') or not os.path.isfile(original):
    print('Not a jsonl file')
    exit(1)

out_dir = os.path.dirname(os.path.abspath(original))

with open(original, 'r') as f:
    lines = f.readlines()

random.shuffle(lines)

for i in range(no_dataset):
    l = int(i * len(lines) / no_dataset)
    u = int((i + 1) * len(lines) / no_dataset)
    # expects the input filename to contain '_all'; it is replaced with the split index
    out_filename = os.path.basename(original)[0:-6].replace('_all', '_' + str(i)) + '.jsonl'
    with open(out_dir + '/' + out_filename, 'w') as f:
        f.writelines(lines[l:u])
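
Example invocation (hypothetical path): shuffle and split a dataset into 4 parts, producing dataset_0.jsonl ... dataset_3.jsonl next to the input file:

    python utils/split_shuffle_dataset.py datasets/dataset_all.jsonl 4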