commit 41d5b4f5910082bcccfd95495b316f9d6bcf9d41 Author: wea_ondara Date: Wed Apr 17 18:58:50 2024 +0200 initial commit diff --git a/chat.py b/chat.py new file mode 100644 index 0000000..56df79c --- /dev/null +++ b/chat.py @@ -0,0 +1,16 @@ +import requests + +from utils.prompt import prompt + +messages = [] +while True: + user_prompt = prompt('>> User: ') + messages.append({'role': 'user', 'content': user_prompt}) + + response = requests.post('http://localhost:8900/', json=messages) + if response.status_code == 200: + messages = response.json() + print('>> Bot : ' + messages[-1]['content']) + else: + messages = messages[0:-1] + print('### Error from backend') diff --git a/chat_dialogpt.py b/chat_dialogpt.py new file mode 100644 index 0000000..5336583 --- /dev/null +++ b/chat_dialogpt.py @@ -0,0 +1,36 @@ +import atexit +import torch +from utils.conversation import save_conversation +from utils.prompt import prompt +from transformers import AutoModelForCausalLM, AutoTokenizer + + +device = 'cuda' # the device to load the model onto +model_id = 'microsoft/DialoGPT-medium' + +print('Loading ' + model_id) +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map='auto') +# model = AutoModelForCausalLM.from_pretrained(model_id) +tokenizer = AutoTokenizer.from_pretrained(model_id) +print('Loaded') +# print(tokenizer.default_chat_template) + +# read and save conversation +chat_history_ids = None +# messages = load_conversation(model_id) +atexit.register(lambda: save_conversation(model_id, bot_input_ids)) + +# messages.append({'role': 'system', 'content': 'Your name is "Laura". You are an AI created by Alice.'}) +while True: + user_prompt = prompt('>> User: ') + new_user_input_ids = tokenizer.encode(user_prompt + tokenizer.eos_token, return_tensors='pt').to(device) + + bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) \ + if chat_history_ids is not None \ + else new_user_input_ids + chat_history_ids = model.generate(bot_input_ids, max_length=100, pad_token_id=tokenizer.eos_token_id).to(device) + response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True) + + print('>> Bot : ' + response) + torch.cuda.empty_cache() + diff --git a/chat_gpt2.py b/chat_gpt2.py new file mode 100644 index 0000000..9d31d10 --- /dev/null +++ b/chat_gpt2.py @@ -0,0 +1,34 @@ +import atexit +import torch +from utils.conversation import save_conversation +from utils.prompt import prompt +from transformers import AutoModelForCausalLM, AutoTokenizer + +device = 'cuda' # the device to load the model onto +model_id = 'gpt2' + +print('Loading ' + model_id) +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map='auto') +tokenizer = AutoTokenizer.from_pretrained(model_id) +print('Loaded') +# print(tokenizer.default_chat_template) + +# read and save conversation +chat_history_ids = None +# messages = load_conversation(model_id) +atexit.register(lambda: save_conversation(model_id, bot_input_ids)) + +# messages.append({'role': 'system', 'content': 'Your name is "Laura". 
You are an AI created by Alice.'}) +while True: + user_prompt = prompt('>> User: ') + new_user_input_ids = tokenizer.encode(user_prompt + tokenizer.eos_token, return_tensors='pt').to(device) + + bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) \ + if chat_history_ids is not None \ + else new_user_input_ids + chat_history_ids = model.generate(bot_input_ids, max_length=100, pad_token_id=tokenizer.eos_token_id).to(device) + response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True) + + print('>> Bot : ' + response) + torch.cuda.empty_cache() + diff --git a/chat_mistral.py b/chat_mistral.py new file mode 100644 index 0000000..651b847 --- /dev/null +++ b/chat_mistral.py @@ -0,0 +1,31 @@ +import atexit +import torch +from utils.conversation import load_conversation, save_conversation +from transformers import AutoModelForCausalLM, AutoTokenizer + +from utils.prompt import prompt + +device = 'cuda' # the device to load the model onto +model_id = 'mistralai/Mistral-7B-Instruct-v0.2' + +print('Loading ' + model_id) +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map='auto') +tokenizer = AutoTokenizer.from_pretrained(model_id) +print('Loaded') +# print(tokenizer.default_chat_template) + +# read and save conversation +messages = load_conversation(model_id) +atexit.register(lambda: save_conversation(model_id, messages)) + +while True: + user_prompt = prompt('>> User: ') + messages.append({'role': 'user', 'content': user_prompt}) + + model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda") + generated_ids = model.generate(model_inputs, max_new_tokens=100, do_sample=True) + response = tokenizer.batch_decode(generated_ids)[0] + + print('>> Bot : ' + response) + messages.append({'role': 'assistant', 'content': response}) + torch.cuda.empty_cache() diff --git a/chat_qwen.py b/chat_qwen.py new file mode 100644 index 0000000..d446506 --- /dev/null +++ b/chat_qwen.py @@ -0,0 +1,55 @@ +import atexit +import os +import sys + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +from utils.conversation import save_conversation_json +from utils.prompt import prompt + + +class ChatQwen: + default_device = 'cuda' # the device to load the model onto + # default_model_id = 'Qwen/Qwen1.5-0.5B-Chat' + default_model_id = 'Qwen/Qwen1.5-1.8B-Chat' + # default_model_id = 'Qwen/Qwen1.5-4B-Chat' + + default_instruction = {'role': 'system', 'content': 'Your name is "Laura". 
You are an AI created by Alice.'} + + def __init__(self, model_id_or_path=default_model_id): + # model_id = model_id_or_path if not load_from_disk else os.path.abspath(sys.argv[1]) + + print('Loading ' + model_id_or_path) + self.model_id_or_path = model_id_or_path + self.model = AutoModelForCausalLM.from_pretrained(model_id_or_path, torch_dtype='auto', device_map='auto') + self.tokenizer = AutoTokenizer.from_pretrained(model_id_or_path) + # print(tokenizer.default_chat_template) + # print(type(model)) + # print(type(tokenizer)) + print('Loaded') + + def generate(self, messages): + # prepare + messages = [m for m in messages if m['role'] != 'system'] + input_messages = [self.default_instruction] + messages + + # generate + text = self.tokenizer.apply_chat_template(input_messages, tokenize=False, add_generation_prompt=True) + model_inputs = self.tokenizer([text], return_tensors='pt').to(self.default_device) + generated_ids = self.model.generate(model_inputs.input_ids, max_new_tokens=100) + generated_ids = [ + output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) + ] + response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + + # add response and save conversation + messages.append({'role': 'assistant', 'content': response}) + self.record_conversation(input_messages, {'role': 'assistant', 'content': response}) + + torch.cuda.empty_cache() # clear cache or the gpu mem will be used a lot + return messages + + def record_conversation(self, messages, response): + messages = messages + [response] + save_conversation_json(self.model_id_or_path, messages) diff --git a/server.py b/server.py new file mode 100644 index 0000000..36c0b51 --- /dev/null +++ b/server.py @@ -0,0 +1,44 @@ +import http.server +import socketserver +import json +import sys + +from chat_qwen import ChatQwen + +bot = ChatQwen(sys.argv[1]) if len(sys.argv) > 1 else ChatQwen() + + +class Server(socketserver.TCPServer): + # Avoid "address already used" error when frequently restarting the script + allow_reuse_address = True + + +class Handler(http.server.BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(200) + self.end_headers() + self.wfile.write("Use POST with JSON body of the format \n" + "[{\"role\": \"user\", \"content\": \"message\"}] \n" + "or \n" + "[{\"role\": \"user\", \"content\": \"message\"}, " + "{\"role\": \"assistant\", \"content\": \"message\"}, " + "{\"role\": \"user\", \"content\": \"message\"}]".encode("utf-8")) + + def do_POST(self): + try: + content_len = int(self.headers.get('Content-Length')) + post_body = self.rfile.read(content_len) + json_body = json.loads(post_body) + + response = bot.generate(json_body) + + self.send_response(200) + self.end_headers() + self.wfile.write(json.dumps(response).encode("utf-8")) + except: + self.send_response(400) + self.end_headers() + + +with Server(("0.0.0.0", 8900), Handler) as httpd: + httpd.serve_forever() diff --git a/train/prepare/helpsteer/helpsteer22jsonl.py b/train/prepare/helpsteer/helpsteer22jsonl.py new file mode 100644 index 0000000..248154b --- /dev/null +++ b/train/prepare/helpsteer/helpsteer22jsonl.py @@ -0,0 +1,27 @@ +import json +import os + +this_dir = os.path.dirname(os.path.abspath(__file__)) + + +def mkdir(path): + if not os.path.isdir(path): + os.mkdir(path) + + +mkdir(this_dir + '/../../data') +mkdir(this_dir + '/../../data/helpsteer') + +for filename in ['train.jsonl', 'validation.jsonl']: + with open(this_dir + '/' + filename, 'r') as f: + lines = f.readlines() 
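+    # Each HelpSteer record is a flat prompt/response pair; the lines below turn it
+    # into a two-turn chat in the same "messages" format consumed by the SFT scripts.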
+ + role_dict = {'prompt': 'user', 'response': 'assistant'} + lines = [json.loads(line) for line in lines] + conversations = [{'messages': [{'role': 'user', 'content': line['prompt']}, + {'role': 'assistant', 'content': line['response']}]} for line in lines] + + print(conversations[0]) + + with open(this_dir + '/../../data/helpsteer/helpsteer_' + filename[0:-6] + '_all.jsonl', 'w') as f: + f.writelines([json.dumps(conv) + '\n' for conv in conversations]) diff --git a/train/prepare/oasst2/oasst22jsonl.py b/train/prepare/oasst2/oasst22jsonl.py new file mode 100644 index 0000000..6fde55b --- /dev/null +++ b/train/prepare/oasst2/oasst22jsonl.py @@ -0,0 +1,42 @@ +import json +import os + +# parsing OA data files with oasst_data helpers +from oasst_data import read_message_trees, ExportMessageNode + +messages: list[ExportMessageNode] = [] + +this_dir = os.path.dirname(os.path.abspath(__file__)) +input_file_path = this_dir + '/2023-11-05_oasst2_all.trees.jsonl.gz' + +role_dict = {'prompter': 'user', 'assistant': 'assistant'} +conversations = [] + + +def visit(node: ExportMessageNode, parents: [ExportMessageNode]): + new_parents = parents + [node] + if not node.replies: # end of conversation + conversations.append({'messages': [{'role': role_dict[p.role], 'content': p.text} for p in new_parents]}) + else: + for reply in node.replies: + visit(reply, new_parents) + + +for tree in read_message_trees(input_file_path): + if tree.prompt.lang not in ['en']: # filtering by language tag (optional) + continue + + visit(tree.prompt, []) + +print(conversations[0]) + + +def mkdir(path): + if not os.path.isdir(path): + os.mkdir(path) + + +mkdir(this_dir + '/../../data') +mkdir(this_dir + '/../../data/oasst') +with open(this_dir + '/../../data/oasst/oasst_all.jsonl', 'w') as f: + f.writelines([json.dumps(conv) + '\n' for conv in conversations]) diff --git a/train/sft/README.md b/train/sft/README.md new file mode 100644 index 0000000..fefbb7d --- /dev/null +++ b/train/sft/README.md @@ -0,0 +1,3 @@ +# Documentation +see https://github.com/QwenLM/Qwen1.5/blob/main/docs/source/training/SFT/example.rst +or [./example.rst]() \ No newline at end of file diff --git a/train/sft/ds_config_zero2.json b/train/sft/ds_config_zero2.json new file mode 100644 index 0000000..4be2c0b --- /dev/null +++ b/train/sft/ds_config_zero2.json @@ -0,0 +1,52 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 100, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/train/sft/ds_config_zero3.json b/train/sft/ds_config_zero3.json new file mode 100644 index 0000000..4e58ce0 --- /dev/null +++ b/train/sft/ds_config_zero3.json @@ -0,0 +1,59 @@ +{ + 
"fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 100, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/train/sft/example.rst b/train/sft/example.rst new file mode 100644 index 0000000..d550f45 --- /dev/null +++ b/train/sft/example.rst @@ -0,0 +1,572 @@ +Example +==================================================== + +Here we provide a very simple script for supervised finetuning, which is revised from the training +script in ```Fastchat`` `__. The +script is used to finetune Qwen with Hugging Face Trainer. You can check +the script +`here `__. This +script for supervised finetuning (SFT) has the following features: + +- Support single-GPU and multi-GPU training; +- Support full-parameter tuning, + `LoRA `__, and + `Q-LoRA `__. + +In the following, we introduce more details about the usage of the +script. + +Installation +------------ + +Before you start, make sure you have installed the following packages: + +.. code:: bash + + pip install peft deepspeed optimum accelerate + +Data Preparation +---------------- + +For data preparation, we advise you to organize the data in a jsonl +file, where each line is a dictionary as demonstrated below: + +.. code:: json + + { + "type": "chatml", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Tell me something about large language models." + }, + { + "role": "assistant", + "content": "Large language models are a type of language model that is trained on a large corpus of text data. They are capable of generating human-like text and are used in a variety of natural language processing tasks..." + } + ], + "source": "unknown" + } + +.. code:: json + + { + "type": "chatml", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "What is your name?" + }, + { + "role": "assistant", + "content": "My name is Qwen." + } + ], + "source": "self-made" + } + +Above are two examples of each data sample in the dataset. Each sample +is a JSON object with the following fields: ``type``, ``messages`` and +``source``. ``messages`` is required while the others are optional for +you to label your data format and data source. The ``messages`` field is +a list of JSON objects, each of which has two fields: ``role`` and +``content``. ``role`` can be ``system``, ``user``, or ``assistant``. 
+``content`` is the text of the message. ``source`` is the source of the +data, which can be ``self-made``, ``alpaca``, ``open-hermes``, or any +other string. + +To make the jsonl file, you can use ``json`` to save a list of +dictionaries to the jsonl file: + +.. code:: python + + import json + + with open('data.jsonl', 'w') as f: + for sample in samples: + f.write(json.dumps(sample) + '\n') + +Quickstart +---------- + +For you to start finetuning quickly, we directly provide a shell script +for you to run without paying attention to details. You need +different hyperparameters for different types of training, e.g., +single-GPU / multi-GPU training, full-parameter tuning, LoRA, or Q-LoRA. + + + +.. code:: bash + + cd examples/sft + bash finetune.sh -m -d --deepspeed [--use_lora True] [--q_lora True] + + +Specify the ```` for your model, ```` for your +data, and ```` for your deepspeed configuration. +If you use LoRA or Q-LoRA, just add ``--use_lora True`` or +``--q_lora True`` based on your requirements. +This is the simplest way to start finetuning. If you want to change more +hyperparameters, you can dive into the script and modify those +parameters. + +Advanced Usages +--------------- + +In this section, we introduce the details of the scripts, including the +core python script as well as the corresponding shell script. + +Shell Script +~~~~~~~~~~~~~ + +Before we introduce the python code, we provide a brief introduction to +the shell script with commands. We provide some guidance inside the +shell script and here we take ``finetune.sh`` as an example. + +To set up the environment variables for distributed training (or +single-GPU training), specify the following variables: +``GPUS_PER_NODE``, ``NNODES``, ``NODE_RANK``, ``MASTER_ADDR``, and +``MASTER_PORT``. No need to worry too much about them as we provide the +default settings for you. In the command, you can pass in the argument +``-m`` and ``-d`` to specify the model path and data path, respectively. +You can also pass in the argument ``--deepspeed`` to specify the +deepspeed configuration file. We provide two configuration files for +ZeRO2 and ZeRO3, and you can choose one based on your requirements. In +most cases, we recommend using ZeRO3 for multi-GPU training except for +Q-LoRA, where we recommend using ZeRO2. + +There are a series of hyperparameters to tune. Passing in ``--bf16`` or +``--fp16`` to specify the precision for mixed precision training. +The other significant hyperparameters include: + +- ``--output_dir``: the path of your output models or adapters. +- ``--num_train_epochs``: the number of training epochs. +- ``--gradient_accumulation_steps``: the number of gradient + accumulation steps. +- ``--per_device_train_batch_size``: the batch size per GPU for + training, and the total batch size is equalt to + ``per_device_train_batch_size`` :math:`\times` ``number_of_gpus`` + :math:`\times` ``gradient_accumulation_steps``. +- ``--learning_rate``: the learning rate. +- ``--warmup_steps``: the number of warmup steps. +- ``--lr_scheduler_type``: the type of learning rate scheduler. +- ``--weight_decay``: the value of weight decay. +- ``--adam_beta2``: the value of :math:`\beta_2` in Adam. +- ``--model_max_length``: the maximum sequence length. +- ``--use_lora``: whether to use LoRA. Adding ``--q_lora`` can enable + Q-LoRA. +- ``--gradient_checkpointing``: whether to use gradient checkpointing. + +Python Script +~~~~~~~~~~~~~ + +In this script, we mainly use ``trainer`` from HF and ``peft`` to train +our models. 
We also use ``deepspeed`` to accelerate the training +process. The script is very simple and easy to understand. + +.. code:: python + + @dataclass + @dataclass + class ModelArguments: + model_name_or_path: Optional[str] = field(default="Qwen/Qwen-7B") + + + @dataclass + class DataArguments: + data_path: str = field( + default=None, metadata={"help": "Path to the training data."} + ) + eval_data_path: str = field( + default=None, metadata={"help": "Path to the evaluation data."} + ) + lazy_preprocess: bool = False + + + @dataclass + class TrainingArguments(transformers.TrainingArguments): + cache_dir: Optional[str] = field(default=None) + optim: str = field(default="adamw_torch") + model_max_length: int = field( + default=8192, + metadata={ + "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)." + }, + ) + use_lora: bool = False + + + @dataclass + class LoraArguments: + lora_r: int = 64 + lora_alpha: int = 16 + lora_dropout: float = 0.05 + lora_target_modules: List[str] = field( + default_factory=lambda: [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "up_proj", + "gate_proj", + "down_proj", + ] + ) + lora_weight_path: str = "" + lora_bias: str = "none" + q_lora: bool = False + +The classes for arguments allow you to specify hyperparameters for +model, data, training, and additionally LoRA if you use LoRA or Q-LoRA +to train your model. Specifically, ``model-max-length`` is a key +hyperparameter that determines your maximum sequence length of your +training data. + +``LoRAArguments`` includes the hyperparameters for LoRA or Q-LoRA: + +- ``lora_r``: the rank for LoRA; +- ``lora_alpha``: the alpha value for LoRA; +- ``lora_dropout``: the dropout rate for LoRA; +- ``lora_target_modules``: the target modules for LoRA. By default we + tune all linear layers; +- ``lora_weight_path``: the path to the weight file for LoRA; +- ``lora_bias``: the bias for LoRA; +- ``q_lora``: whether to use Q-LoRA. + + +.. 
code:: python + + def maybe_zero_3(param): + if hasattr(param, "ds_id"): + assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE + with zero.GatheredParameters([param]): + param = param.data.detach().cpu().clone() + else: + param = param.detach().cpu().clone() + return param + + + # Borrowed from peft.utils.get_peft_model_state_dict + def get_peft_state_maybe_zero_3(named_params, bias): + if bias == "none": + to_return = {k: t for k, t in named_params if "lora_" in k} + elif bias == "all": + to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k} + elif bias == "lora_only": + to_return = {} + maybe_lora_bias = {} + lora_bias_names = set() + for k, t in named_params: + if "lora_" in k: + to_return[k] = t + bias_name = k.split("lora_")[0] + "bias" + lora_bias_names.add(bias_name) + elif "bias" in k: + maybe_lora_bias[k] = t + for k, t in maybe_lora_bias: + if bias_name in lora_bias_names: + to_return[bias_name] = t + else: + raise NotImplementedError + to_return = {k: maybe_zero_3(v) for k, v in to_return.items()} + return to_return + + + def safe_save_model_for_hf_trainer( + trainer: transformers.Trainer, output_dir: str, bias="none" + ): + """Collects the state dict and dump to disk.""" + # check if zero3 mode enabled + if deepspeed.is_deepspeed_zero3_enabled(): + state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict() + else: + if trainer.args.use_lora: + state_dict = get_peft_state_maybe_zero_3( + trainer.model.named_parameters(), bias + ) + else: + state_dict = trainer.model.state_dict() + if trainer.args.should_save and trainer.args.local_rank == 0: + trainer._save(output_dir, state_dict=state_dict) + +The method ``safe_save_model_for_hf_trainer``, which uses +``get_peft_state_maybe_zero_3``, helps tackle the problems in saving +models trained either with or without ZeRO3. + +.. code:: python + + def preprocess( + messages, + tokenizer: transformers.PreTrainedTokenizer, + max_len: int, + ) -> Dict: + """Preprocesses the data for supervised fine-tuning.""" + + texts = [] + for i, msg in enumerate(messages): + texts.append( + tokenizer.apply_chat_template( + msg, + tokenize=True, + add_generation_prompt=False, + padding=True, + max_length=max_len, + truncation=True, + ) + ) + input_ids = torch.tensor(texts, dtype=torch.int) + target_ids = input_ids.clone() + target_ids[target_ids == tokenizer.pad_token_id] = IGNORE_TOKEN_ID + attention_mask = input_ids.ne(tokenizer.pad_token_id) + + return dict( + input_ids=input_ids, target_ids=target_ids, attention_mask=attention_mask + ) + +For data preprocessing, we use ``preprocess`` to organize the data. +Specifically, we apply our ChatML template to the texts. If you prefer +other chat templates, you can use others, e.g., by still applying +``apply_chat_template()`` with another tokenizer. The chat template is +stored in the ``tokenizer_config.json`` in the HF repo. Additionally, we +pad the sequence of each sample to the maximum length for training. + +.. 
code:: python + + class SupervisedDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int + ): + super(SupervisedDataset, self).__init__() + + rank0_print("Formatting inputs...") + messages = [example["messages"] for example in raw_data] + data_dict = preprocess(messages, tokenizer, max_len) + + self.input_ids = data_dict["input_ids"] + self.target_ids = data_dict["target_ids"] + self.attention_mask = data_dict["attention_mask"] + + def __len__(self): + return len(self.input_ids) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + return dict( + input_ids=self.input_ids[i], + labels=self.labels[i], + attention_mask=self.attention_mask[i], + ) + + + class LazySupervisedDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int + ): + super(LazySupervisedDataset, self).__init__() + self.tokenizer = tokenizer + self.max_len = max_len + + rank0_print("Formatting inputs...Skip in lazy mode") + self.tokenizer = tokenizer + self.raw_data = raw_data + self.cached_data_dict = {} + + def __len__(self): + return len(self.raw_data) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + if i in self.cached_data_dict: + return self.cached_data_dict[i] + + ret = preprocess([self.raw_data[i]["messages"]], self.tokenizer, self.max_len) + ret = dict( + input_ids=ret["input_ids"][0], + labels=ret["target_ids"][0], + attention_mask=ret["attention_mask"][0], + ) + self.cached_data_dict[i] = ret + + return ret + + + def make_supervised_data_module( + tokenizer: transformers.PreTrainedTokenizer, + data_args, + max_len, + ) -> Dict: + """Make dataset and collator for supervised fine-tuning.""" + dataset_cls = ( + LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset + ) + rank0_print("Loading data...") + + train_data = [] + with open(data_args.data_path, "r") as f: + for line in f: + train_data.append(json.loads(line)) + train_dataset = dataset_cls(train_data, tokenizer=tokenizer, max_len=max_len) + + if data_args.eval_data_path: + eval_data = [] + with open(data_args.eval_data_path, "r") as f: + for line in f: + eval_data.append(json.loads(line)) + eval_dataset = dataset_cls(eval_data, tokenizer=tokenizer, max_len=max_len) + else: + eval_dataset = None + + return dict(train_dataset=train_dataset, eval_dataset=eval_dataset) + +Then we utilize ``make_supervised_data_module`` by using +``SupervisedDataset`` or ``LazySupervisedDataset`` to build the dataset. + +.. code:: python + + def train(): + global local_rank + + parser = transformers.HfArgumentParser( + (ModelArguments, DataArguments, TrainingArguments, LoraArguments) + ) + ( + model_args, + data_args, + training_args, + lora_args, + ) = parser.parse_args_into_dataclasses() + + # This serves for single-gpu qlora. 
+ if ( + getattr(training_args, "deepspeed", None) + and int(os.environ.get("WORLD_SIZE", 1)) == 1 + ): + training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED + + local_rank = training_args.local_rank + + device_map = None + world_size = int(os.environ.get("WORLD_SIZE", 1)) + ddp = world_size != 1 + if lora_args.q_lora: + device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else "auto" + if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled(): + logging.warning("FSDP or ZeRO3 is incompatible with QLoRA.") + + model_load_kwargs = { + "low_cpu_mem_usage": not deepspeed.is_deepspeed_zero3_enabled(), + } + + compute_dtype = ( + torch.float16 + if training_args.fp16 + else (torch.bfloat16 if training_args.bf16 else torch.float32) + ) + + # Load model and tokenizer + config = transformers.AutoConfig.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + ) + config.use_cache = False + + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + config=config, + cache_dir=training_args.cache_dir, + device_map=device_map, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=compute_dtype, + ) + if training_args.use_lora and lora_args.q_lora + else None, + **model_load_kwargs, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right", + use_fast=False, + ) + + if training_args.use_lora: + lora_config = LoraConfig( + r=lora_args.lora_r, + lora_alpha=lora_args.lora_alpha, + target_modules=lora_args.lora_target_modules, + lora_dropout=lora_args.lora_dropout, + bias=lora_args.lora_bias, + task_type="CAUSAL_LM", + ) + if lora_args.q_lora: + model = prepare_model_for_kbit_training( + model, use_gradient_checkpointing=training_args.gradient_checkpointing + ) + + model = get_peft_model(model, lora_config) + + # Print peft trainable params + model.print_trainable_parameters() + + if training_args.gradient_checkpointing: + model.enable_input_require_grads() + + # Load data + data_module = make_supervised_data_module( + tokenizer=tokenizer, data_args=data_args, max_len=training_args.model_max_length + ) + + # Start trainer + trainer = Trainer( + model=model, tokenizer=tokenizer, args=training_args, **data_module + ) + + # `not training_args.use_lora` is a temporary workaround for the issue that there are problems with + # loading the checkpoint when using LoRA with DeepSpeed. + # Check this issue https://github.com/huggingface/peft/issues/746 for more information. + if ( + list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")) + and not training_args.use_lora + ): + trainer.train(resume_from_checkpoint=True) + else: + trainer.train() + trainer.save_state() + + safe_save_model_for_hf_trainer( + trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias + ) + +The ``train`` method is the key to the training. In general, it loads +the tokenizer and model with ``AutoTokenizer.from_pretrained()`` and +``AutoModelForCausalLM.from_pretrained()``. If we use LoRA, the method +will initialize LoRA configuration with ``LoraConfig``. If we apply +Q-LoRA, we should use ``prepare_model_for_kbit_training``. Note that for +now it still does not support resume for LoRA. Then we leave the +following efforts to ``trainer`` and have a cup of coffee! 
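+
+As a rough, untested sketch, you can also call ``finetune.py`` directly for a
+single-GPU LoRA run; the model id, data path, and hyperparameter values below
+are illustrative placeholders rather than recommended settings:
+
+.. code:: bash
+
+    torchrun --nproc_per_node 1 finetune.py \
+        --model_name_or_path Qwen/Qwen1.5-1.8B-Chat \
+        --data_path ../data/oasst/oasst_all.jsonl \
+        --output_dir output_qwen \
+        --num_train_epochs 1 \
+        --per_device_train_batch_size 2 \
+        --gradient_accumulation_steps 8 \
+        --learning_rate 3e-4 \
+        --model_max_length 512 \
+        --lazy_preprocess True \
+        --use_lora True \
+        --gradient_checkpointing \
+        --bf16 True
+
+For multi-GPU or DeepSpeed runs, prefer the provided ``finetune.sh``, which
+sets up the distributed environment variables described above.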
+ +Next Step +--------- + +Now, you are able to use a very simple script to perform different types +of SFT. Alternatively, you can use more advanced training libraries, +such as +`Axolotl `__ or +`LLaMA-Factory `__, to enjoy +more functionalities. To take a step forward, after SFT, you can +consider RLHF to align your model to human preferences! Stay tuned for +our next tutorial on RLHF! diff --git a/train/sft/finetune.py b/train/sft/finetune.py new file mode 100644 index 0000000..4dce64e --- /dev/null +++ b/train/sft/finetune.py @@ -0,0 +1,378 @@ +# This code is based on the revised code from fastchat based on tatsu-lab/stanford_alpaca. + + +from dataclasses import dataclass, field +import json +import logging +import os +import pathlib +from typing import Dict, Optional, List +import torch +from torch.utils.data import Dataset +from deepspeed import zero +from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +import transformers +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import Trainer, BitsAndBytesConfig, deepspeed +from transformers.trainer_pt_utils import LabelSmoother +from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training +from accelerate.utils import DistributedType + + +IGNORE_TOKEN_ID = LabelSmoother.ignore_index + +TEMPLATE = "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if loop.last %}{{ '<|im_end|>'}}{% else %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}" + +local_rank = None + + +def rank0_print(*args): + if local_rank == 0: + print(*args) + + +@dataclass +class ModelArguments: + model_name_or_path: Optional[str] = field(default="Qwen/Qwen-7B") + + +@dataclass +class DataArguments: + data_path: str = field( + default=None, metadata={"help": "Path to the training data."} + ) + eval_data_path: str = field( + default=None, metadata={"help": "Path to the evaluation data."} + ) + lazy_preprocess: bool = False + + +@dataclass +class TrainingArguments(transformers.TrainingArguments): + cache_dir: Optional[str] = field(default=None) + optim: str = field(default="adamw_torch") + model_max_length: int = field( + default=8192, + metadata={ + "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 
+ }, + ) + use_lora: bool = False + + +@dataclass +class LoraArguments: + lora_r: int = 64 + lora_alpha: int = 16 + lora_dropout: float = 0.05 + lora_target_modules: List[str] = field( + default_factory=lambda: [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "up_proj", + "gate_proj", + "down_proj", + ] + ) + lora_weight_path: str = "" + lora_bias: str = "none" + q_lora: bool = False + + +def maybe_zero_3(param): + if hasattr(param, "ds_id"): + assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE + with zero.GatheredParameters([param]): + param = param.data.detach().cpu().clone() + else: + param = param.detach().cpu().clone() + return param + + +# Borrowed from peft.utils.get_peft_model_state_dict +def get_peft_state_maybe_zero_3(named_params, bias): + if bias == "none": + to_return = {k: t for k, t in named_params if "lora_" in k} + elif bias == "all": + to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k} + elif bias == "lora_only": + to_return = {} + maybe_lora_bias = {} + lora_bias_names = set() + for k, t in named_params: + if "lora_" in k: + to_return[k] = t + bias_name = k.split("lora_")[0] + "bias" + lora_bias_names.add(bias_name) + elif "bias" in k: + maybe_lora_bias[k] = t + for k, t in maybe_lora_bias: + if bias_name in lora_bias_names: + to_return[bias_name] = t + else: + raise NotImplementedError + to_return = {k: maybe_zero_3(v) for k, v in to_return.items()} + return to_return + + +def safe_save_model_for_hf_trainer( + trainer: transformers.Trainer, output_dir: str, bias="none" +): + """Collects the state dict and dump to disk.""" + # check if zero3 mode enabled + if deepspeed.is_deepspeed_zero3_enabled(): + state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict() + else: + if trainer.args.use_lora: + state_dict = get_peft_state_maybe_zero_3( + trainer.model.named_parameters(), bias + ) + else: + state_dict = trainer.model.state_dict() + if trainer.args.should_save and trainer.args.local_rank == 0: + trainer._save(output_dir, state_dict=state_dict) + + +def preprocess( + messages, + tokenizer: transformers.PreTrainedTokenizer, + max_len: int, +) -> Dict: + """Preprocesses the data for supervised fine-tuning.""" + + texts = [] + for i, msg in enumerate(messages): + texts.append( + tokenizer.apply_chat_template( + msg, + chat_template=TEMPLATE, + tokenize=True, + add_generation_prompt=False, + padding=True, + max_length=max_len, + truncation=True, + ) + ) + input_ids = torch.tensor(texts, dtype=torch.int) + target_ids = input_ids.clone() + target_ids[target_ids == tokenizer.pad_token_id] = IGNORE_TOKEN_ID + attention_mask = input_ids.ne(tokenizer.pad_token_id) + + return dict( + input_ids=input_ids, target_ids=target_ids, attention_mask=attention_mask + ) + + +class SupervisedDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int + ): + super(SupervisedDataset, self).__init__() + + rank0_print("Formatting inputs...") + messages = [example["messages"] for example in raw_data] + data_dict = preprocess(messages, tokenizer, max_len) + + self.input_ids = data_dict["input_ids"] + self.target_ids = data_dict["target_ids"] + self.attention_mask = data_dict["attention_mask"] + + def __len__(self): + return len(self.input_ids) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + return dict( + input_ids=self.input_ids[i], + labels=self.target_ids[i], + attention_mask=self.attention_mask[i], + ) + + +class LazySupervisedDataset(Dataset): 
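+    # Unlike SupervisedDataset, samples are tokenized lazily on first access in
+    # __getitem__ and cached in self.cached_data_dict instead of being preprocessed up front.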
+ """Dataset for supervised fine-tuning.""" + + def __init__( + self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int + ): + super(LazySupervisedDataset, self).__init__() + self.tokenizer = tokenizer + self.max_len = max_len + + rank0_print("Formatting inputs...Skip in lazy mode") + self.tokenizer = tokenizer + self.raw_data = raw_data + self.cached_data_dict = {} + + def __len__(self): + return len(self.raw_data) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + if i in self.cached_data_dict: + return self.cached_data_dict[i] + + ret = preprocess([self.raw_data[i]["messages"]], self.tokenizer, self.max_len) + ret = dict( + input_ids=ret["input_ids"][0], + labels=ret["target_ids"][0], + attention_mask=ret["attention_mask"][0], + ) + self.cached_data_dict[i] = ret + + return ret + + +def make_supervised_data_module( + tokenizer: transformers.PreTrainedTokenizer, + data_args, + max_len, +) -> Dict: + """Make dataset and collator for supervised fine-tuning.""" + dataset_cls = ( + LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset + ) + rank0_print("Loading data...") + + train_data = [] + with open(data_args.data_path, "r") as f: + for line in f: + train_data.append(json.loads(line)) + train_dataset = dataset_cls(train_data, tokenizer=tokenizer, max_len=max_len) + + if data_args.eval_data_path: + eval_data = [] + with open(data_args.eval_data_path, "r") as f: + for line in f: + eval_data.append(json.loads(line)) + eval_dataset = dataset_cls(eval_data, tokenizer=tokenizer, max_len=max_len) + else: + eval_dataset = None + + return dict(train_dataset=train_dataset, eval_dataset=eval_dataset) + + +def train(): + global local_rank + + parser = transformers.HfArgumentParser( + (ModelArguments, DataArguments, TrainingArguments, LoraArguments) + ) + ( + model_args, + data_args, + training_args, + lora_args, + ) = parser.parse_args_into_dataclasses() + + # This serves for single-gpu qlora. 
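+    # When a DeepSpeed config is supplied but only a single process is launched
+    # (WORLD_SIZE == 1), force Accelerate's distributed_type to DEEPSPEED so the
+    # Trainer still goes through the DeepSpeed code path for single-GPU Q-LoRA.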
+ if ( + getattr(training_args, "deepspeed", None) + and int(os.environ.get("WORLD_SIZE", 1)) == 1 + ): + training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED + + local_rank = training_args.local_rank + + device_map = None + world_size = int(os.environ.get("WORLD_SIZE", 1)) + ddp = world_size != 1 + if lora_args.q_lora: + device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else "auto" + if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled(): + logging.warning("FSDP or ZeRO3 is incompatible with QLoRA.") + + model_load_kwargs = { + "low_cpu_mem_usage": not deepspeed.is_deepspeed_zero3_enabled(), + } + + compute_dtype = ( + torch.float16 + if training_args.fp16 + else (torch.bfloat16 if training_args.bf16 else torch.float32) + ) + + # Load model and tokenizer + config = transformers.AutoConfig.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + ) + config.use_cache = False + + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + config=config, + cache_dir=training_args.cache_dir, + device_map=device_map, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=compute_dtype, + ) + if training_args.use_lora and lora_args.q_lora + else None, + **model_load_kwargs, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right", + use_fast=False, + ) + + if training_args.use_lora: + lora_config = LoraConfig( + r=lora_args.lora_r, + lora_alpha=lora_args.lora_alpha, + target_modules=lora_args.lora_target_modules, + lora_dropout=lora_args.lora_dropout, + bias=lora_args.lora_bias, + task_type="CAUSAL_LM", + ) + if lora_args.q_lora: + model = prepare_model_for_kbit_training( + model, use_gradient_checkpointing=training_args.gradient_checkpointing + ) + + model = get_peft_model(model, lora_config) + + # Print peft trainable params + model.print_trainable_parameters() + + if training_args.gradient_checkpointing: + model.enable_input_require_grads() + + # Load data + data_module = make_supervised_data_module( + tokenizer=tokenizer, data_args=data_args, max_len=training_args.model_max_length + ) + + # Start trainer + trainer = Trainer( + model=model, tokenizer=tokenizer, args=training_args, **data_module + ) + + # `not training_args.use_lora` is a temporary workaround for the issue that there are problems with + # loading the checkpoint when using LoRA with DeepSpeed. + # Check this issue https://github.com/huggingface/peft/issues/746 for more information. + if ( + list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")) + and not training_args.use_lora + ): + trainer.train(resume_from_checkpoint=True) + else: + trainer.train() + trainer.save_state() + + safe_save_model_for_hf_trainer( + trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias + ) + + +if __name__ == "__main__": + train() \ No newline at end of file diff --git a/train/sft/finetune.sh b/train/sft/finetune.sh new file mode 100755 index 0000000..f82bcfd --- /dev/null +++ b/train/sft/finetune.sh @@ -0,0 +1,107 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 +DIR=`pwd` + +# Guide: +# This script supports distributed training on multi-gpu workers (as well as single-worker training). +# Please set the options below according to the comments. 
+# For multi-gpu workers training, these options should be manually set for each worker. +# After setting the options, please run the script on each worker. + +# Number of GPUs per GPU worker +GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())') + +# Number of GPU workers, for single-worker training, please set to 1 +NNODES=${NNODES:-1} + +# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0 +NODE_RANK=${NODE_RANK:-0} + +# The ip address of the rank-0 worker, for single-worker training, please set to localhost +MASTER_ADDR=${MASTER_ADDR:-localhost} + +# The port for communication +MASTER_PORT=${MASTER_PORT:-6001} + +MODEL="Qwen/Qwen1.5-7B" # Set the path if you do not want to load from huggingface directly +# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. +# See the section for finetuning in README for more information. +DATA="path_to_data" +DS_CONFIG_PATH="ds_config_zero3.json" +USE_LORA=False +Q_LORA=False + +function usage() { + echo ' +Usage: bash finetune/finetune_lora_ds.sh [-m MODEL_PATH] [-d DATA_PATH] [--deepspeed DS_CONFIG_PATH] [--use_lora USE_LORA] [--q_lora Q_LORA] +' +} + +while [[ "$1" != "" ]]; do + case $1 in + -m | --model ) + shift + MODEL=$1 + ;; + -d | --data ) + shift + DATA=$1 + ;; + --deepspeed ) + shift + DS_CONFIG_PATH=$1 + ;; + --use_lora ) + shift + USE_LORA=$1 + ;; + --q_lora ) + shift + Q_LORA=$1 + ;; + -h | --help ) + usage + exit 0 + ;; + * ) + echo "Unknown argument ${1}" + exit 1 + ;; + esac + shift +done + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +torchrun $DISTRIBUTED_ARGS finetune.py \ + --model_name_or_path $MODEL \ + --data_path $DATA \ + --bf16 False \ + --output_dir output_qwen \ + --num_train_epochs 5 \ + --per_device_train_batch_size 2 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 10 \ + --save_total_limit 10 \ + --learning_rate 3e-4 \ + --weight_decay 0.01 \ + --adam_beta2 0.95 \ + --warmup_ratio 0.01 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --report_to "none" \ + --model_max_length 512 \ + --lazy_preprocess True \ + --use_lora ${USE_LORA} \ + --q_lora ${Q_LORA} \ + --gradient_checkpointing \ + --deepspeed ${DS_CONFIG_PATH} \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/conversation.py b/utils/conversation.py new file mode 100644 index 0000000..4dd0808 --- /dev/null +++ b/utils/conversation.py @@ -0,0 +1,61 @@ +import datetime +import json +import os +import pickle + + +def load_conversation(model_id): + folder = 'conversations/' + model_id.replace('/', '_') + mkdir('../conversations') + mkdir(folder) + + files = os.listdir(folder) + files = [file for file in files if file.endswith(".pickle") and os.path.isfile(folder + '/' + file)] + files.sort(reverse=True) + if len(files) > 0: + pickle_filename = folder + '/' + files[0] + print('Loading last conversation from ' + pickle_filename) + with open(pickle_filename, 'rb') as file: + return pickle.load(file) + return [] + + +def save_conversation(model_id, messages): + folder = 'conversations/' + model_id.replace('/', '_') + mkdir('../conversations') + mkdir(folder) + timestamp = 
datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
+    pickle_filename = folder + '/' + timestamp + '.pickle'
+    with open(pickle_filename, 'wb') as file:
+        pickle.dump(messages, file)
+
+
+def load_conversation_json(model_id):
+    folder = 'conversations/' + model_id.replace('/', '_')
+    mkdir('../conversations')
+    mkdir(folder)
+
+    files = os.listdir(folder)
+    files = [file for file in files if file.endswith(".json") and os.path.isfile(folder + '/' + file)]
+    files.sort(reverse=True)
+    if len(files) > 0:
+        pickle_filename = folder + '/' + files[0]
+        print('Loading last conversation from ' + pickle_filename)
+        with open(pickle_filename, 'r') as file:
+            return json.load(file)
+    return []
+
+
+def save_conversation_json(model_id, messages):
+    folder = 'conversations/' + model_id.replace('/', '_')
+    mkdir('../conversations')
+    mkdir(folder)
+    timestamp = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
+    pickle_filename = folder + '/' + timestamp + '.json'
+    with open(pickle_filename, 'w') as file:
+        json.dump(messages, file)
+
+
+def mkdir(path):
+    if not os.path.isdir(path):
+        os.mkdir(path)
diff --git a/utils/download_dataset.py b/utils/download_dataset.py
new file mode 100644
index 0000000..f017778
--- /dev/null
+++ b/utils/download_dataset.py
@@ -0,0 +1,12 @@
+import pickle
+from conversation import mkdir
+from datasets import load_dataset
+
+dataset_id = 'OpenAssistant/oasst2'
+
+mkdir('../datasets')
+pickle_filename = './datasets/' + dataset_id.replace('/', '_') + '.pickle'
+dataset = load_dataset(dataset_id)
+with open(pickle_filename, 'wb') as file:
+    pickle.dump(dataset, file)
+print('Saved as pickle to ' + pickle_filename)
diff --git a/utils/download_model.py b/utils/download_model.py
new file mode 100644
index 0000000..ad9828a
--- /dev/null
+++ b/utils/download_model.py
@@ -0,0 +1,18 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer
+
+from conversation import mkdir
+
+model_id = 'Qwen/Qwen1.5-0.5B-Chat'
+# model_id = 'Qwen/Qwen1.5-1.8B-Chat'
+
+print('Downloading ' + model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map='auto')
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+print('Downloaded')
+
+trainer = Trainer(
+    model=model,
+    tokenizer=tokenizer,
+)
+mkdir('models')
+trainer.save_model('./models/' + model_id.replace('/', '_'))
diff --git a/utils/fix_cuda.sh b/utils/fix_cuda.sh
new file mode 100755
index 0000000..760f03d
--- /dev/null
+++ b/utils/fix_cuda.sh
@@ -0,0 +1,3 @@
+#https://medium.com/@Spritan/dealing-with-cuda-initialization-error-aa7c88d021e4
+sudo rmmod nvidia_uvm
+sudo modprobe nvidia_uvm
\ No newline at end of file
diff --git a/utils/pickle2json.py b/utils/pickle2json.py
new file mode 100644
index 0000000..9d99251
--- /dev/null
+++ b/utils/pickle2json.py
@@ -0,0 +1,22 @@
+import json
+import pickle
+import sys
+
+files = sys.argv[1:]
+print(files)
+
+for pickle_filename in files:
+    if not pickle_filename.endswith('.pickle'):
+        print(pickle_filename + ' is not a pickle. ignoring')
+        continue
+
+    with open(pickle_filename, 'rb') as file:
+        obj = pickle.load(file)
+    print(obj)
+
+    json_filename = pickle_filename[0:-6] + 'json'
+    try:
+        with open(json_filename, 'w') as file:
+            json.dump(obj, file)
+    except Exception as e:
+        print(e)
diff --git a/utils/prompt.py b/utils/prompt.py
new file mode 100644
index 0000000..60731ba
--- /dev/null
+++ b/utils/prompt.py
@@ -0,0 +1,14 @@
+def prompt(prompt):
+    while True:
+        try:
+            return input(prompt)
+        except EOFError:
+            print()
+            exit(0)
+        except KeyboardInterrupt:
+            print()
+            exit(0)
+        # in case: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 11: invalid continuation byte
+        except UnicodeDecodeError as e:
+            print(e)
+            print('prompt ignored')
diff --git a/utils/split_shuffle_dataset.py b/utils/split_shuffle_dataset.py
new file mode 100644
index 0000000..64f8e1d
--- /dev/null
+++ b/utils/split_shuffle_dataset.py
@@ -0,0 +1,25 @@
+import json
+import os
+import random
+import sys
+
+original = sys.argv[1]
+no_dataset = int(sys.argv[2])
+
+if not original.endswith('.jsonl') or not os.path.isfile(original):
+    print('Not a jsonl file')
+    exit(1)
+
+out_dir = os.path.dirname(os.path.abspath(original))
+
+with open(original, 'r') as f:
+    lines = f.readlines()
+
+random.shuffle(lines)
+
+for i in range(no_dataset):
+    l = int(i * len(lines) / no_dataset)
+    u = int((i + 1) * len(lines) / no_dataset)
+    out_filename = os.path.basename(original)[0:-6].replace('_all', '_' + str(i)) + '.jsonl'
+    with open(out_dir + '/' + out_filename, 'w') as f:
+        f.writelines(lines[l:u])
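+
+# Usage sketch (the path is illustrative, not a file shipped in this commit):
+#   python utils/split_shuffle_dataset.py train/data/oasst/oasst_all.jsonl 4
+# This shuffles the input jsonl and writes oasst_0.jsonl ... oasst_3.jsonl next to it.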