initial commit

chat.py (new file, 16 lines)
@@ -0,0 +1,16 @@

import requests

from utils.prompt import prompt

messages = []
while True:
    user_prompt = prompt('>> User: ')
    messages.append({'role': 'user', 'content': user_prompt})

    response = requests.post('http://localhost:8900/', json=messages)
    if response.status_code == 200:
        messages = response.json()
        print('>> Bot : ' + messages[-1]['content'])
    else:
        messages = messages[0:-1]
        print('### Error from backend')

chat_dialogpt.py (new file, 36 lines)
@@ -0,0 +1,36 @@

import atexit
import torch
from utils.conversation import save_conversation
from utils.prompt import prompt
from transformers import AutoModelForCausalLM, AutoTokenizer


device = 'cuda'  # the device to load the model onto
model_id = 'microsoft/DialoGPT-medium'

print('Loading ' + model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map='auto')
# model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
print('Loaded')
# print(tokenizer.default_chat_template)

# read and save conversation
chat_history_ids = None
# messages = load_conversation(model_id)
atexit.register(lambda: save_conversation(model_id, bot_input_ids))

# messages.append({'role': 'system', 'content': 'Your name is "Laura". You are an AI created by Alice.'})
while True:
    user_prompt = prompt('>> User: ')
    new_user_input_ids = tokenizer.encode(user_prompt + tokenizer.eos_token, return_tensors='pt').to(device)

    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) \
        if chat_history_ids is not None \
        else new_user_input_ids
    chat_history_ids = model.generate(bot_input_ids, max_length=100, pad_token_id=tokenizer.eos_token_id).to(device)
    response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)

    print('>> Bot : ' + response)
    torch.cuda.empty_cache()

chat_gpt2.py (new file, 34 lines)
@@ -0,0 +1,34 @@

import atexit
import torch
from utils.conversation import save_conversation
from utils.prompt import prompt
from transformers import AutoModelForCausalLM, AutoTokenizer

device = 'cuda'  # the device to load the model onto
model_id = 'gpt2'

print('Loading ' + model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_id)
print('Loaded')
# print(tokenizer.default_chat_template)

# read and save conversation
chat_history_ids = None
# messages = load_conversation(model_id)
atexit.register(lambda: save_conversation(model_id, bot_input_ids))

# messages.append({'role': 'system', 'content': 'Your name is "Laura". You are an AI created by Alice.'})
while True:
    user_prompt = prompt('>> User: ')
    new_user_input_ids = tokenizer.encode(user_prompt + tokenizer.eos_token, return_tensors='pt').to(device)

    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) \
        if chat_history_ids is not None \
        else new_user_input_ids
    chat_history_ids = model.generate(bot_input_ids, max_length=100, pad_token_id=tokenizer.eos_token_id).to(device)
    response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)

    print('>> Bot : ' + response)
    torch.cuda.empty_cache()

chat_mistral.py (new file, 31 lines)
@@ -0,0 +1,31 @@

import atexit
import torch
from utils.conversation import load_conversation, save_conversation
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils.prompt import prompt

device = 'cuda'  # the device to load the model onto
model_id = 'mistralai/Mistral-7B-Instruct-v0.2'

print('Loading ' + model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_id)
print('Loaded')
# print(tokenizer.default_chat_template)

# read and save conversation
messages = load_conversation(model_id)
atexit.register(lambda: save_conversation(model_id, messages))

while True:
    user_prompt = prompt('>> User: ')
    messages.append({'role': 'user', 'content': user_prompt})

    model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
    generated_ids = model.generate(model_inputs, max_new_tokens=100, do_sample=True)
    response = tokenizer.batch_decode(generated_ids)[0]

    print('>> Bot : ' + response)
    messages.append({'role': 'assistant', 'content': response})
    torch.cuda.empty_cache()
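
Note that tokenizer.batch_decode(generated_ids)[0] above decodes the whole sequence,
prompt included, so the printed reply repeats the conversation so far. If only the newly
generated tokens are wanted, the ids can be sliced before decoding. An illustrative
sketch (variable names follow chat_mistral.py; the slicing mirrors the approach used in
chat_qwen.py below):

    new_token_ids = generated_ids[:, model_inputs.shape[-1]:]  # drop the prompt tokens
    response = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]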

chat_qwen.py (new file, 55 lines)
@@ -0,0 +1,55 @@

import atexit
import os
import sys

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils.conversation import save_conversation_json
from utils.prompt import prompt


class ChatQwen:
    default_device = 'cuda'  # the device to load the model onto
    # default_model_id = 'Qwen/Qwen1.5-0.5B-Chat'
    default_model_id = 'Qwen/Qwen1.5-1.8B-Chat'
    # default_model_id = 'Qwen/Qwen1.5-4B-Chat'

    default_instruction = {'role': 'system', 'content': 'Your name is "Laura". You are an AI created by Alice.'}

    def __init__(self, model_id_or_path=default_model_id):
        # model_id = model_id_or_path if not load_from_disk else os.path.abspath(sys.argv[1])

        print('Loading ' + model_id_or_path)
        self.model_id_or_path = model_id_or_path
        self.model = AutoModelForCausalLM.from_pretrained(model_id_or_path, torch_dtype='auto', device_map='auto')
        self.tokenizer = AutoTokenizer.from_pretrained(model_id_or_path)
        # print(tokenizer.default_chat_template)
        # print(type(model))
        # print(type(tokenizer))
        print('Loaded')

    def generate(self, messages):
        # prepare
        messages = [m for m in messages if m['role'] != 'system']
        input_messages = [self.default_instruction] + messages

        # generate
        text = self.tokenizer.apply_chat_template(input_messages, tokenize=False, add_generation_prompt=True)
        model_inputs = self.tokenizer([text], return_tensors='pt').to(self.default_device)
        generated_ids = self.model.generate(model_inputs.input_ids, max_new_tokens=100)
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # add response and save conversation
        messages.append({'role': 'assistant', 'content': response})
        self.record_conversation(input_messages, {'role': 'assistant', 'content': response})

        torch.cuda.empty_cache()  # clear cache or the gpu mem will be used a lot
        return messages

    def record_conversation(self, messages, response):
        messages = messages + [response]
        save_conversation_json(self.model_id_or_path, messages)
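
For reference, an illustrative usage sketch of the ChatQwen class above (the model id is
the class default; generate() returns the updated message list with the assistant reply
appended last):

    from chat_qwen import ChatQwen

    bot = ChatQwen()
    history = bot.generate([{'role': 'user', 'content': 'Hello!'}])
    print(history[-1]['content'])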

server.py (new file, 44 lines)
@@ -0,0 +1,44 @@

import http.server
import socketserver
import json
import sys

from chat_qwen import ChatQwen

bot = ChatQwen(sys.argv[1]) if len(sys.argv) > 1 else ChatQwen()


class Server(socketserver.TCPServer):
    # Avoid "address already used" error when frequently restarting the script
    allow_reuse_address = True


class Handler(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.end_headers()
        self.wfile.write("Use POST with JSON body of the format \n"
                         "[{\"role\": \"user\", \"content\": \"message\"}] \n"
                         "or \n"
                         "[{\"role\": \"user\", \"content\": \"message\"}, "
                         "{\"role\": \"assistant\", \"content\": \"message\"}, "
                         "{\"role\": \"user\", \"content\": \"message\"}]".encode("utf-8"))

    def do_POST(self):
        try:
            content_len = int(self.headers.get('Content-Length'))
            post_body = self.rfile.read(content_len)
            json_body = json.loads(post_body)

            response = bot.generate(json_body)

            self.send_response(200)
            self.end_headers()
            self.wfile.write(json.dumps(response).encode("utf-8"))
        except:
            self.send_response(400)
            self.end_headers()


with Server(("0.0.0.0", 8900), Handler) as httpd:
    httpd.serve_forever()
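
For reference, an illustrative client call against this server (a sketch, assuming the
server is running locally on the configured port 8900; the server replies with the full
message list, assistant reply last, which is what chat.py relies on):

    import requests

    history = [{'role': 'user', 'content': 'Hello, who are you?'}]
    reply = requests.post('http://localhost:8900/', json=history, timeout=120)
    reply.raise_for_status()
    history = reply.json()
    print(history[-1]['content'])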

train/prepare/helpsteer/helpsteer22jsonl.py (new file, 27 lines)
@@ -0,0 +1,27 @@

import json
import os

this_dir = os.path.dirname(os.path.abspath(__file__))


def mkdir(path):
    if not os.path.isdir(path):
        os.mkdir(path)


mkdir(this_dir + '/../../data')
mkdir(this_dir + '/../../data/helpsteer')

for filename in ['train.jsonl', 'validation.jsonl']:
    with open(this_dir + '/' + filename, 'r') as f:
        lines = f.readlines()

    role_dict = {'prompt': 'user', 'response': 'assistant'}
    lines = [json.loads(line) for line in lines]
    conversations = [{'messages': [{'role': 'user', 'content': line['prompt']},
                                   {'role': 'assistant', 'content': line['response']}]} for line in lines]

    print(conversations[0])

    with open(this_dir + '/../../data/helpsteer/helpsteer_' + filename[0:-6] + '_all.jsonl', 'w') as f:
        f.writelines([json.dumps(conv) + '\n' for conv in conversations])

train/prepare/oasst2/oasst22jsonl.py (new file, 42 lines)
@@ -0,0 +1,42 @@

import json
import os

# parsing OA data files with oasst_data helpers
from oasst_data import read_message_trees, ExportMessageNode

messages: list[ExportMessageNode] = []

this_dir = os.path.dirname(os.path.abspath(__file__))
input_file_path = this_dir + '/2023-11-05_oasst2_all.trees.jsonl.gz'

role_dict = {'prompter': 'user', 'assistant': 'assistant'}
conversations = []


def visit(node: ExportMessageNode, parents: [ExportMessageNode]):
    new_parents = parents + [node]
    if not node.replies:  # end of conversation
        conversations.append({'messages': [{'role': role_dict[p.role], 'content': p.text} for p in new_parents]})
    else:
        for reply in node.replies:
            visit(reply, new_parents)


for tree in read_message_trees(input_file_path):
    if tree.prompt.lang not in ['en']:  # filtering by language tag (optional)
        continue

    visit(tree.prompt, [])

print(conversations[0])


def mkdir(path):
    if not os.path.isdir(path):
        os.mkdir(path)


mkdir(this_dir + '/../../data')
mkdir(this_dir + '/../../data/oasst')
with open(this_dir + '/../../data/oasst/oasst_all.jsonl', 'w') as f:
    f.writelines([json.dumps(conv) + '\n' for conv in conversations])

train/sft/README.md (new file, 3 lines)
@@ -0,0 +1,3 @@

# Documentation

See https://github.com/QwenLM/Qwen1.5/blob/main/docs/source/training/SFT/example.rst
or [example.rst](./example.rst)

train/sft/ds_config_zero2.json (new file, 52 lines)
@@ -0,0 +1,52 @@

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

train/sft/ds_config_zero3.json (new file, 59 lines)
@@ -0,0 +1,59 @@

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "offload_param": {
            "device": "none",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

train/sft/example.rst (new file, 572 lines)
@@ -0,0 +1,572 @@

Example
====================================================

Here we provide a very simple script for supervised finetuning, which is revised from the training
script in `FastChat <https://github.com/lm-sys/FastChat>`__. The
script is used to finetune Qwen with Hugging Face Trainer. You can check
the script
`here <https://github.com/QwenLM/Qwen1.5/blob/main/finetune.py>`__. This
script for supervised finetuning (SFT) has the following features:

- Support single-GPU and multi-GPU training;
- Support full-parameter tuning,
  `LoRA <https://arxiv.org/abs/2106.09685>`__, and
  `Q-LoRA <https://arxiv.org/abs/2305.14314>`__.

In the following, we introduce more details about the usage of the
script.

Installation
------------

Before you start, make sure you have installed the following packages:

.. code:: bash

    pip install peft deepspeed optimum accelerate

Data Preparation
----------------

For data preparation, we advise you to organize the data in a jsonl
file, where each line is a dictionary as demonstrated below:

.. code:: json

    {
        "type": "chatml",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": "Tell me something about large language models."
            },
            {
                "role": "assistant",
                "content": "Large language models are a type of language model that is trained on a large corpus of text data. They are capable of generating human-like text and are used in a variety of natural language processing tasks..."
            }
        ],
        "source": "unknown"
    }

.. code:: json

    {
        "type": "chatml",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": "What is your name?"
            },
            {
                "role": "assistant",
                "content": "My name is Qwen."
            }
        ],
        "source": "self-made"
    }

Above are two examples of each data sample in the dataset. Each sample
is a JSON object with the following fields: ``type``, ``messages`` and
``source``. ``messages`` is required while the others are optional for
you to label your data format and data source. The ``messages`` field is
a list of JSON objects, each of which has two fields: ``role`` and
``content``. ``role`` can be ``system``, ``user``, or ``assistant``.
``content`` is the text of the message. ``source`` is the source of the
data, which can be ``self-made``, ``alpaca``, ``open-hermes``, or any
other string.

To make the jsonl file, you can use ``json`` to save a list of
dictionaries to the jsonl file:

.. code:: python

    import json

    with open('data.jsonl', 'w') as f:
        for sample in samples:
            f.write(json.dumps(sample) + '\n')
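
Conversely, a quick sanity check that every line of the resulting file parses
and carries the required ``messages`` field can be written as follows (an
illustrative sketch, not part of the training script):

.. code:: python

    import json

    with open('data.jsonl') as f:
        for lineno, line in enumerate(f, 1):
            sample = json.loads(line)
            assert 'messages' in sample, f'line {lineno}: missing "messages"'
            assert all('role' in m and 'content' in m for m in sample['messages'])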

Quickstart
----------

For you to start finetuning quickly, we directly provide a shell script
for you to run without paying attention to details. You need
different hyperparameters for different types of training, e.g.,
single-GPU / multi-GPU training, full-parameter tuning, LoRA, or Q-LoRA.

.. code:: bash

    cd examples/sft
    bash finetune.sh -m <model_path> -d <data_path> --deepspeed <config_path> [--use_lora True] [--q_lora True]

Specify the ``<model_path>`` for your model, ``<data_path>`` for your
data, and ``<config_path>`` for your deepspeed configuration.
If you use LoRA or Q-LoRA, just add ``--use_lora True`` or
``--q_lora True`` based on your requirements.
This is the simplest way to start finetuning. If you want to change more
hyperparameters, you can dive into the script and modify those
parameters.

Advanced Usages
---------------

In this section, we introduce the details of the scripts, including the
core python script as well as the corresponding shell script.

Shell Script
~~~~~~~~~~~~

Before we introduce the python code, we provide a brief introduction to
the shell script with commands. We provide some guidance inside the
shell script and here we take ``finetune.sh`` as an example.

To set up the environment variables for distributed training (or
single-GPU training), specify the following variables:
``GPUS_PER_NODE``, ``NNODES``, ``NODE_RANK``, ``MASTER_ADDR``, and
``MASTER_PORT``. No need to worry too much about them as we provide the
default settings for you. In the command, you can pass in the arguments
``-m`` and ``-d`` to specify the model path and data path, respectively.
You can also pass in the argument ``--deepspeed`` to specify the
deepspeed configuration file. We provide two configuration files for
ZeRO2 and ZeRO3, and you can choose one based on your requirements. In
most cases, we recommend using ZeRO3 for multi-GPU training except for
Q-LoRA, where we recommend using ZeRO2.

There are a series of hyperparameters to tune. Pass in ``--bf16`` or
``--fp16`` to specify the precision for mixed precision training.
The other significant hyperparameters include:

- ``--output_dir``: the path of your output models or adapters.
- ``--num_train_epochs``: the number of training epochs.
- ``--gradient_accumulation_steps``: the number of gradient
  accumulation steps.
- ``--per_device_train_batch_size``: the batch size per GPU for
  training, and the total batch size is equal to
  ``per_device_train_batch_size`` :math:`\times` ``number_of_gpus``
  :math:`\times` ``gradient_accumulation_steps``.
- ``--learning_rate``: the learning rate.
- ``--warmup_steps``: the number of warmup steps.
- ``--lr_scheduler_type``: the type of learning rate scheduler.
- ``--weight_decay``: the value of weight decay.
- ``--adam_beta2``: the value of :math:`\beta_2` in Adam.
- ``--model_max_length``: the maximum sequence length.
- ``--use_lora``: whether to use LoRA. Adding ``--q_lora`` can enable
  Q-LoRA.
- ``--gradient_checkpointing``: whether to use gradient checkpointing.

Python Script
~~~~~~~~~~~~~

In this script, we mainly use ``trainer`` from HF and ``peft`` to train
our models. We also use ``deepspeed`` to accelerate the training
process. The script is very simple and easy to understand.

.. code:: python

    @dataclass
    class ModelArguments:
        model_name_or_path: Optional[str] = field(default="Qwen/Qwen-7B")


    @dataclass
    class DataArguments:
        data_path: str = field(
            default=None, metadata={"help": "Path to the training data."}
        )
        eval_data_path: str = field(
            default=None, metadata={"help": "Path to the evaluation data."}
        )
        lazy_preprocess: bool = False


    @dataclass
    class TrainingArguments(transformers.TrainingArguments):
        cache_dir: Optional[str] = field(default=None)
        optim: str = field(default="adamw_torch")
        model_max_length: int = field(
            default=8192,
            metadata={
                "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
            },
        )
        use_lora: bool = False


    @dataclass
    class LoraArguments:
        lora_r: int = 64
        lora_alpha: int = 16
        lora_dropout: float = 0.05
        lora_target_modules: List[str] = field(
            default_factory=lambda: [
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "up_proj",
                "gate_proj",
                "down_proj",
            ]
        )
        lora_weight_path: str = ""
        lora_bias: str = "none"
        q_lora: bool = False

The classes for arguments allow you to specify hyperparameters for
model, data, training, and additionally LoRA if you use LoRA or Q-LoRA
to train your model. Specifically, ``model-max-length`` is a key
hyperparameter that determines your maximum sequence length of your
training data.

``LoraArguments`` includes the hyperparameters for LoRA or Q-LoRA:

- ``lora_r``: the rank for LoRA;
- ``lora_alpha``: the alpha value for LoRA;
- ``lora_dropout``: the dropout rate for LoRA;
- ``lora_target_modules``: the target modules for LoRA. By default we
  tune all linear layers;
- ``lora_weight_path``: the path to the weight file for LoRA;
- ``lora_bias``: the bias for LoRA;
- ``q_lora``: whether to use Q-LoRA.


.. code:: python

    def maybe_zero_3(param):
        if hasattr(param, "ds_id"):
            assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE
            with zero.GatheredParameters([param]):
                param = param.data.detach().cpu().clone()
        else:
            param = param.detach().cpu().clone()
        return param


    # Borrowed from peft.utils.get_peft_model_state_dict
    def get_peft_state_maybe_zero_3(named_params, bias):
        if bias == "none":
            to_return = {k: t for k, t in named_params if "lora_" in k}
        elif bias == "all":
            to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
        elif bias == "lora_only":
            to_return = {}
            maybe_lora_bias = {}
            lora_bias_names = set()
            for k, t in named_params:
                if "lora_" in k:
                    to_return[k] = t
                    bias_name = k.split("lora_")[0] + "bias"
                    lora_bias_names.add(bias_name)
                elif "bias" in k:
                    maybe_lora_bias[k] = t
            for k, t in maybe_lora_bias:
                if bias_name in lora_bias_names:
                    to_return[bias_name] = t
        else:
            raise NotImplementedError
        to_return = {k: maybe_zero_3(v) for k, v in to_return.items()}
        return to_return


    def safe_save_model_for_hf_trainer(
        trainer: transformers.Trainer, output_dir: str, bias="none"
    ):
        """Collects the state dict and dump to disk."""
        # check if zero3 mode enabled
        if deepspeed.is_deepspeed_zero3_enabled():
            state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict()
        else:
            if trainer.args.use_lora:
                state_dict = get_peft_state_maybe_zero_3(
                    trainer.model.named_parameters(), bias
                )
            else:
                state_dict = trainer.model.state_dict()
        if trainer.args.should_save and trainer.args.local_rank == 0:
            trainer._save(output_dir, state_dict=state_dict)

The method ``safe_save_model_for_hf_trainer``, which uses
``get_peft_state_maybe_zero_3``, helps tackle the problems in saving
models trained either with or without ZeRO3.

.. code:: python

    def preprocess(
        messages,
        tokenizer: transformers.PreTrainedTokenizer,
        max_len: int,
    ) -> Dict:
        """Preprocesses the data for supervised fine-tuning."""

        texts = []
        for i, msg in enumerate(messages):
            texts.append(
                tokenizer.apply_chat_template(
                    msg,
                    tokenize=True,
                    add_generation_prompt=False,
                    padding=True,
                    max_length=max_len,
                    truncation=True,
                )
            )
        input_ids = torch.tensor(texts, dtype=torch.int)
        target_ids = input_ids.clone()
        target_ids[target_ids == tokenizer.pad_token_id] = IGNORE_TOKEN_ID
        attention_mask = input_ids.ne(tokenizer.pad_token_id)

        return dict(
            input_ids=input_ids, target_ids=target_ids, attention_mask=attention_mask
        )

For data preprocessing, we use ``preprocess`` to organize the data.
Specifically, we apply our ChatML template to the texts. If you prefer
other chat templates, you can use others, e.g., by still applying
``apply_chat_template()`` with another tokenizer. The chat template is
stored in the ``tokenizer_config.json`` in the HF repo. Additionally, we
pad the sequence of each sample to the maximum length for training.
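
For intuition, the ChatML template defined in ``finetune.py`` renders one sample
roughly as shown below (an illustrative sketch with ``tokenize=False`` for
readability; the exact string depends on the tokenizer and template in use):

.. code:: python

    messages = [
        {"role": "user", "content": "What is your name?"},
        {"role": "assistant", "content": "My name is Qwen."},
    ]
    text = tokenizer.apply_chat_template(messages, chat_template=TEMPLATE, tokenize=False)
    # text is expected to look like:
    # <|im_start|>system
    # You are a helpful assistant.<|im_end|>
    # <|im_start|>user
    # What is your name?<|im_end|>
    # <|im_start|>assistant
    # My name is Qwen.<|im_end|>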

.. code:: python

    class SupervisedDataset(Dataset):
        """Dataset for supervised fine-tuning."""

        def __init__(
            self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
        ):
            super(SupervisedDataset, self).__init__()

            rank0_print("Formatting inputs...")
            messages = [example["messages"] for example in raw_data]
            data_dict = preprocess(messages, tokenizer, max_len)

            self.input_ids = data_dict["input_ids"]
            self.target_ids = data_dict["target_ids"]
            self.attention_mask = data_dict["attention_mask"]

        def __len__(self):
            return len(self.input_ids)

        def __getitem__(self, i) -> Dict[str, torch.Tensor]:
            return dict(
                input_ids=self.input_ids[i],
                labels=self.target_ids[i],
                attention_mask=self.attention_mask[i],
            )


    class LazySupervisedDataset(Dataset):
        """Dataset for supervised fine-tuning."""

        def __init__(
            self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
        ):
            super(LazySupervisedDataset, self).__init__()
            self.tokenizer = tokenizer
            self.max_len = max_len

            rank0_print("Formatting inputs...Skip in lazy mode")
            self.tokenizer = tokenizer
            self.raw_data = raw_data
            self.cached_data_dict = {}

        def __len__(self):
            return len(self.raw_data)

        def __getitem__(self, i) -> Dict[str, torch.Tensor]:
            if i in self.cached_data_dict:
                return self.cached_data_dict[i]

            ret = preprocess([self.raw_data[i]["messages"]], self.tokenizer, self.max_len)
            ret = dict(
                input_ids=ret["input_ids"][0],
                labels=ret["target_ids"][0],
                attention_mask=ret["attention_mask"][0],
            )
            self.cached_data_dict[i] = ret

            return ret


    def make_supervised_data_module(
        tokenizer: transformers.PreTrainedTokenizer,
        data_args,
        max_len,
    ) -> Dict:
        """Make dataset and collator for supervised fine-tuning."""
        dataset_cls = (
            LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset
        )
        rank0_print("Loading data...")

        train_data = []
        with open(data_args.data_path, "r") as f:
            for line in f:
                train_data.append(json.loads(line))
        train_dataset = dataset_cls(train_data, tokenizer=tokenizer, max_len=max_len)

        if data_args.eval_data_path:
            eval_data = []
            with open(data_args.eval_data_path, "r") as f:
                for line in f:
                    eval_data.append(json.loads(line))
            eval_dataset = dataset_cls(eval_data, tokenizer=tokenizer, max_len=max_len)
        else:
            eval_dataset = None

        return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)

Then we utilize ``make_supervised_data_module`` by using
``SupervisedDataset`` or ``LazySupervisedDataset`` to build the dataset.

.. code:: python

    def train():
        global local_rank

        parser = transformers.HfArgumentParser(
            (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
        )
        (
            model_args,
            data_args,
            training_args,
            lora_args,
        ) = parser.parse_args_into_dataclasses()

        # This serves for single-gpu qlora.
        if (
            getattr(training_args, "deepspeed", None)
            and int(os.environ.get("WORLD_SIZE", 1)) == 1
        ):
            training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED

        local_rank = training_args.local_rank

        device_map = None
        world_size = int(os.environ.get("WORLD_SIZE", 1))
        ddp = world_size != 1
        if lora_args.q_lora:
            device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else "auto"
            if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
                logging.warning("FSDP or ZeRO3 is incompatible with QLoRA.")

        model_load_kwargs = {
            "low_cpu_mem_usage": not deepspeed.is_deepspeed_zero3_enabled(),
        }

        compute_dtype = (
            torch.float16
            if training_args.fp16
            else (torch.bfloat16 if training_args.bf16 else torch.float32)
        )

        # Load model and tokenizer
        config = transformers.AutoConfig.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=training_args.cache_dir,
        )
        config.use_cache = False

        model = AutoModelForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            config=config,
            cache_dir=training_args.cache_dir,
            device_map=device_map,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=compute_dtype,
            )
            if training_args.use_lora and lora_args.q_lora
            else None,
            **model_load_kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=training_args.cache_dir,
            model_max_length=training_args.model_max_length,
            padding_side="right",
            use_fast=False,
        )

        if training_args.use_lora:
            lora_config = LoraConfig(
                r=lora_args.lora_r,
                lora_alpha=lora_args.lora_alpha,
                target_modules=lora_args.lora_target_modules,
                lora_dropout=lora_args.lora_dropout,
                bias=lora_args.lora_bias,
                task_type="CAUSAL_LM",
            )
            if lora_args.q_lora:
                model = prepare_model_for_kbit_training(
                    model, use_gradient_checkpointing=training_args.gradient_checkpointing
                )

            model = get_peft_model(model, lora_config)

            # Print peft trainable params
            model.print_trainable_parameters()

            if training_args.gradient_checkpointing:
                model.enable_input_require_grads()

        # Load data
        data_module = make_supervised_data_module(
            tokenizer=tokenizer, data_args=data_args, max_len=training_args.model_max_length
        )

        # Start trainer
        trainer = Trainer(
            model=model, tokenizer=tokenizer, args=training_args, **data_module
        )

        # `not training_args.use_lora` is a temporary workaround for the issue that there are problems with
        # loading the checkpoint when using LoRA with DeepSpeed.
        # Check this issue https://github.com/huggingface/peft/issues/746 for more information.
        if (
            list(pathlib.Path(training_args.output_dir).glob("checkpoint-*"))
            and not training_args.use_lora
        ):
            trainer.train(resume_from_checkpoint=True)
        else:
            trainer.train()
        trainer.save_state()

        safe_save_model_for_hf_trainer(
            trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias
        )

The ``train`` method is the key to the training. In general, it loads
the tokenizer and model with ``AutoTokenizer.from_pretrained()`` and
``AutoModelForCausalLM.from_pretrained()``. If we use LoRA, the method
will initialize LoRA configuration with ``LoraConfig``. If we apply
Q-LoRA, we should use ``prepare_model_for_kbit_training``. Note that for
now it still does not support resume for LoRA. Then we leave the
following efforts to ``trainer`` and have a cup of coffee!

Next Step
---------

Now, you are able to use a very simple script to perform different types
of SFT. Alternatively, you can use more advanced training libraries,
such as
`Axolotl <https://github.com/OpenAccess-AI-Collective/axolotl>`__ or
`LLaMA-Factory <https://github.com/hiyouga/LLaMA-Factory>`__, to enjoy
more functionalities. To take a step forward, after SFT, you can
consider RLHF to align your model to human preferences! Stay tuned for
our next tutorial on RLHF!

train/sft/finetune.py (new file, 378 lines)
@@ -0,0 +1,378 @@

# This code is based on the revised code from fastchat based on tatsu-lab/stanford_alpaca.


from dataclasses import dataclass, field
import json
import logging
import os
import pathlib
from typing import Dict, Optional, List
import torch
from torch.utils.data import Dataset
from deepspeed import zero
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import Trainer, BitsAndBytesConfig, deepspeed
from transformers.trainer_pt_utils import LabelSmoother
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate.utils import DistributedType


IGNORE_TOKEN_ID = LabelSmoother.ignore_index

TEMPLATE = "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if loop.last %}{{ '<|im_end|>'}}{% else %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}"

local_rank = None


def rank0_print(*args):
    if local_rank == 0:
        print(*args)


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="Qwen/Qwen-7B")


@dataclass
class DataArguments:
    data_path: str = field(
        default=None, metadata={"help": "Path to the training data."}
    )
    eval_data_path: str = field(
        default=None, metadata={"help": "Path to the evaluation data."}
    )
    lazy_preprocess: bool = False


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=8192,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    use_lora: bool = False


@dataclass
class LoraArguments:
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_target_modules: List[str] = field(
        default_factory=lambda: [
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "up_proj",
            "gate_proj",
            "down_proj",
        ]
    )
    lora_weight_path: str = ""
    lora_bias: str = "none"
    q_lora: bool = False


def maybe_zero_3(param):
    if hasattr(param, "ds_id"):
        assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE
        with zero.GatheredParameters([param]):
            param = param.data.detach().cpu().clone()
    else:
        param = param.detach().cpu().clone()
    return param


# Borrowed from peft.utils.get_peft_model_state_dict
def get_peft_state_maybe_zero_3(named_params, bias):
    if bias == "none":
        to_return = {k: t for k, t in named_params if "lora_" in k}
    elif bias == "all":
        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
    elif bias == "lora_only":
        to_return = {}
        maybe_lora_bias = {}
        lora_bias_names = set()
        for k, t in named_params:
            if "lora_" in k:
                to_return[k] = t
                bias_name = k.split("lora_")[0] + "bias"
                lora_bias_names.add(bias_name)
            elif "bias" in k:
                maybe_lora_bias[k] = t
        for k, t in maybe_lora_bias:
            if bias_name in lora_bias_names:
                to_return[bias_name] = t
    else:
        raise NotImplementedError
    to_return = {k: maybe_zero_3(v) for k, v in to_return.items()}
    return to_return


def safe_save_model_for_hf_trainer(
    trainer: transformers.Trainer, output_dir: str, bias="none"
):
    """Collects the state dict and dump to disk."""
    # check if zero3 mode enabled
    if deepspeed.is_deepspeed_zero3_enabled():
        state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict()
    else:
        if trainer.args.use_lora:
            state_dict = get_peft_state_maybe_zero_3(
                trainer.model.named_parameters(), bias
            )
        else:
            state_dict = trainer.model.state_dict()
    if trainer.args.should_save and trainer.args.local_rank == 0:
        trainer._save(output_dir, state_dict=state_dict)


def preprocess(
    messages,
    tokenizer: transformers.PreTrainedTokenizer,
    max_len: int,
) -> Dict:
    """Preprocesses the data for supervised fine-tuning."""

    texts = []
    for i, msg in enumerate(messages):
        texts.append(
            tokenizer.apply_chat_template(
                msg,
                chat_template=TEMPLATE,
                tokenize=True,
                add_generation_prompt=False,
                padding=True,
                max_length=max_len,
                truncation=True,
            )
        )
    input_ids = torch.tensor(texts, dtype=torch.int)
    target_ids = input_ids.clone()
    target_ids[target_ids == tokenizer.pad_token_id] = IGNORE_TOKEN_ID
    attention_mask = input_ids.ne(tokenizer.pad_token_id)

    return dict(
        input_ids=input_ids, target_ids=target_ids, attention_mask=attention_mask
    )


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(
        self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
    ):
        super(SupervisedDataset, self).__init__()

        rank0_print("Formatting inputs...")
        messages = [example["messages"] for example in raw_data]
        data_dict = preprocess(messages, tokenizer, max_len)

        self.input_ids = data_dict["input_ids"]
        self.target_ids = data_dict["target_ids"]
        self.attention_mask = data_dict["attention_mask"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(
            input_ids=self.input_ids[i],
            labels=self.target_ids[i],
            attention_mask=self.attention_mask[i],
        )


class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(
        self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int
    ):
        super(LazySupervisedDataset, self).__init__()
        self.tokenizer = tokenizer
        self.max_len = max_len

        rank0_print("Formatting inputs...Skip in lazy mode")
        self.tokenizer = tokenizer
        self.raw_data = raw_data
        self.cached_data_dict = {}

    def __len__(self):
        return len(self.raw_data)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        if i in self.cached_data_dict:
            return self.cached_data_dict[i]

        ret = preprocess([self.raw_data[i]["messages"]], self.tokenizer, self.max_len)
        ret = dict(
            input_ids=ret["input_ids"][0],
            labels=ret["target_ids"][0],
            attention_mask=ret["attention_mask"][0],
        )
        self.cached_data_dict[i] = ret

        return ret


def make_supervised_data_module(
    tokenizer: transformers.PreTrainedTokenizer,
    data_args,
    max_len,
) -> Dict:
    """Make dataset and collator for supervised fine-tuning."""
    dataset_cls = (
        LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset
    )
    rank0_print("Loading data...")

    train_data = []
    with open(data_args.data_path, "r") as f:
        for line in f:
            train_data.append(json.loads(line))
    train_dataset = dataset_cls(train_data, tokenizer=tokenizer, max_len=max_len)

    if data_args.eval_data_path:
        eval_data = []
        with open(data_args.eval_data_path, "r") as f:
            for line in f:
                eval_data.append(json.loads(line))
        eval_dataset = dataset_cls(eval_data, tokenizer=tokenizer, max_len=max_len)
    else:
        eval_dataset = None

    return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)


def train():
    global local_rank

    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments, LoraArguments)
    )
    (
        model_args,
        data_args,
        training_args,
        lora_args,
    ) = parser.parse_args_into_dataclasses()

    # This serves for single-gpu qlora.
    if (
        getattr(training_args, "deepspeed", None)
        and int(os.environ.get("WORLD_SIZE", 1)) == 1
    ):
        training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED

    local_rank = training_args.local_rank

    device_map = None
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    if lora_args.q_lora:
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else "auto"
        if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
            logging.warning("FSDP or ZeRO3 is incompatible with QLoRA.")

    model_load_kwargs = {
        "low_cpu_mem_usage": not deepspeed.is_deepspeed_zero3_enabled(),
    }

    compute_dtype = (
        torch.float16
        if training_args.fp16
        else (torch.bfloat16 if training_args.bf16 else torch.float32)
    )

    # Load model and tokenizer
    config = transformers.AutoConfig.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
    )
    config.use_cache = False

    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=training_args.cache_dir,
        device_map=device_map,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
        )
        if training_args.use_lora and lora_args.q_lora
        else None,
        **model_load_kwargs,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=False,
    )

    if training_args.use_lora:
        lora_config = LoraConfig(
            r=lora_args.lora_r,
            lora_alpha=lora_args.lora_alpha,
            target_modules=lora_args.lora_target_modules,
            lora_dropout=lora_args.lora_dropout,
            bias=lora_args.lora_bias,
            task_type="CAUSAL_LM",
        )
        if lora_args.q_lora:
            model = prepare_model_for_kbit_training(
                model, use_gradient_checkpointing=training_args.gradient_checkpointing
            )

        model = get_peft_model(model, lora_config)

        # Print peft trainable params
        model.print_trainable_parameters()

        if training_args.gradient_checkpointing:
            model.enable_input_require_grads()

    # Load data
    data_module = make_supervised_data_module(
        tokenizer=tokenizer, data_args=data_args, max_len=training_args.model_max_length
    )

    # Start trainer
    trainer = Trainer(
        model=model, tokenizer=tokenizer, args=training_args, **data_module
    )

    # `not training_args.use_lora` is a temporary workaround for the issue that there are problems with
    # loading the checkpoint when using LoRA with DeepSpeed.
    # Check this issue https://github.com/huggingface/peft/issues/746 for more information.
    if (
        list(pathlib.Path(training_args.output_dir).glob("checkpoint-*"))
        and not training_args.use_lora
    ):
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()
    trainer.save_state()

    safe_save_model_for_hf_trainer(
        trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias
    )


if __name__ == "__main__":
    train()
107
train/sft/finetune.sh
Executable file
107
train/sft/finetune.sh
Executable file
@@ -0,0 +1,107 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
export CUDA_DEVICE_MAX_CONNECTIONS=1
|
||||||
|
DIR=`pwd`
|
||||||
|
|
||||||
|
# Guide:
|
||||||
|
# This script supports distributed training on multi-gpu workers (as well as single-worker training).
|
||||||
|
# Please set the options below according to the comments.
|
||||||
|
# For multi-gpu workers training, these options should be manually set for each worker.
|
||||||
|
# After setting the options, please run the script on each worker.
|
||||||
|
|
||||||
|
# Number of GPUs per GPU worker
|
||||||
|
GPUS_PER_NODE=$(python -c 'import torch; print(torch.cuda.device_count())')
|
||||||
|
|
||||||
|
# Number of GPU workers, for single-worker training, please set to 1
|
||||||
|
NNODES=${NNODES:-1}
|
||||||
|
|
||||||
|
# The rank of this worker, should be in {0, ..., WORKER_CNT-1}, for single-worker training, please set to 0
|
||||||
|
NODE_RANK=${NODE_RANK:-0}
|
||||||
|
|
||||||
|
# The ip address of the rank-0 worker, for single-worker training, please set to localhost
|
||||||
|
MASTER_ADDR=${MASTER_ADDR:-localhost}
|
||||||
|
|
||||||
|
# The port for communication
|
||||||
|
MASTER_PORT=${MASTER_PORT:-6001}
|
||||||
|
|
||||||
|
MODEL="Qwen/Qwen1.5-7B" # Set the path if you do not want to load from huggingface directly
|
||||||
|
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
|
||||||
|
# See the section for finetuning in README for more information.
|
||||||
|
DATA="path_to_data"
|
||||||
|
DS_CONFIG_PATH="ds_config_zero3.json"
|
||||||
|
USE_LORA=False
|
||||||
|
Q_LORA=False
|
||||||
|
|
||||||
|
function usage() {
|
||||||
|
echo '
|
||||||
|
Usage: bash finetune/finetune_lora_ds.sh [-m MODEL_PATH] [-d DATA_PATH] [--deepspeed DS_CONFIG_PATH] [--use_lora USE_LORA] [--q_lora Q_LORA]
|
||||||
|
'
|
||||||
|
}
|
||||||
|
|
||||||
|
while [[ "$1" != "" ]]; do
|
||||||
|
case $1 in
|
||||||
|
-m | --model )
|
||||||
|
shift
|
||||||
|
MODEL=$1
|
||||||
|
;;
|
||||||
|
-d | --data )
|
||||||
|
shift
|
||||||
|
DATA=$1
|
||||||
|
;;
|
||||||
|
--deepspeed )
|
||||||
|
shift
|
||||||
|
DS_CONFIG_PATH=$1
|
||||||
|
;;
|
||||||
|
--use_lora )
|
||||||
|
shift
|
||||||
|
USE_LORA=$1
|
||||||
|
;;
|
||||||
|
--q_lora )
|
||||||
|
shift
|
||||||
|
Q_LORA=$1
|
||||||
|
;;
|
||||||
|
-h | --help )
|
||||||
|
usage
|
||||||
|
exit 0
|
||||||
|
;;
|
||||||
|
* )
|
||||||
|
echo "Unknown argument ${1}"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
shift
|
||||||
|
done
|
||||||
|
|
||||||
|
DISTRIBUTED_ARGS="
|
||||||
|
--nproc_per_node $GPUS_PER_NODE \
|
||||||
|
--nnodes $NNODES \
|
||||||
|
--node_rank $NODE_RANK \
|
||||||
|
--master_addr $MASTER_ADDR \
|
||||||
|
--master_port $MASTER_PORT
|
||||||
|
"
|
||||||
|
|
||||||
|
torchrun $DISTRIBUTED_ARGS finetune.py \
|
||||||
|
--model_name_or_path $MODEL \
|
||||||
|
--data_path $DATA \
|
||||||
|
--bf16 False \
|
||||||
|
--output_dir output_qwen \
|
||||||
|
--num_train_epochs 5 \
|
||||||
|
--per_device_train_batch_size 2 \
|
||||||
|
--per_device_eval_batch_size 1 \
|
||||||
|
--gradient_accumulation_steps 8 \
|
||||||
|
--evaluation_strategy "no" \
|
||||||
|
--save_strategy "steps" \
|
||||||
|
--save_steps 10 \
|
||||||
|
--save_total_limit 10 \
|
||||||
|
--learning_rate 3e-4 \
|
||||||
|
--weight_decay 0.01 \
|
||||||
|
--adam_beta2 0.95 \
|
||||||
|
--warmup_ratio 0.01 \
|
||||||
|
--lr_scheduler_type "cosine" \
|
||||||
|
--logging_steps 1 \
|
||||||
|
--report_to "none" \
|
||||||
|
--model_max_length 512 \
|
||||||
|
--lazy_preprocess True \
|
||||||
|
--use_lora ${USE_LORA} \
|
||||||
|
--q_lora ${Q_LORA} \
|
||||||
|
--gradient_checkpointing \
|
||||||
|
--deepspeed ${DS_CONFIG_PATH}
|
||||||
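
# Example invocations (hypothetical paths, not part of the original script):
#   single worker, full finetuning:
#     bash train/sft/finetune.sh -m Qwen/Qwen1.5-0.5B-Chat -d data/train.json
#   single worker, LoRA finetuning:
#     bash train/sft/finetune.sh -m Qwen/Qwen1.5-0.5B-Chat -d data/train.json --use_lora True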
0
utils/__init__.py
Normal file
61
utils/conversation.py
Normal file
@@ -0,0 +1,61 @@
import datetime
import json
import os
import pickle


def load_conversation(model_id):
    folder = 'conversations/' + model_id.replace('/', '_')
    # `folder` is relative to the working directory, so create 'conversations' there as well
    mkdir('conversations')
    mkdir(folder)

    files = os.listdir(folder)
    files = [file for file in files if file.endswith(".pickle") and os.path.isfile(folder + '/' + file)]
    files.sort(reverse=True)
    if len(files) > 0:
        pickle_filename = folder + '/' + files[0]
        print('Loading last conversation from ' + pickle_filename)
        with open(pickle_filename, 'rb') as file:
            return pickle.load(file)
    return []


def save_conversation(model_id, messages):
    folder = 'conversations/' + model_id.replace('/', '_')
    mkdir('conversations')
    mkdir(folder)
    timestamp = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
    pickle_filename = folder + '/' + timestamp + '.pickle'
    with open(pickle_filename, 'wb') as file:
        pickle.dump(messages, file)


def load_conversation_json(model_id):
    folder = 'conversations/' + model_id.replace('/', '_')
    mkdir('conversations')
    mkdir(folder)

    files = os.listdir(folder)
    files = [file for file in files if file.endswith(".json") and os.path.isfile(folder + '/' + file)]
    files.sort(reverse=True)
    if len(files) > 0:
        json_filename = folder + '/' + files[0]
        print('Loading last conversation from ' + json_filename)
        with open(json_filename, 'r') as file:
            return json.load(file)
    return []


def save_conversation_json(model_id, messages):
    folder = 'conversations/' + model_id.replace('/', '_')
    mkdir('conversations')
    mkdir(folder)
    timestamp = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
    json_filename = folder + '/' + timestamp + '.json'
    with open(json_filename, 'w') as file:
        json.dump(messages, file)


def mkdir(path):
    if not os.path.isdir(path):
        os.mkdir(path)
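
A minimal usage sketch (hypothetical model id; assumes the code is run from the repository root so the relative conversations/ folder is shared with the chat scripts):

    from utils.conversation import load_conversation_json, save_conversation_json

    messages = [{'role': 'user', 'content': 'Hello'}]
    save_conversation_json('Qwen/Qwen1.5-0.5B-Chat', messages)
    restored = load_conversation_json('Qwen/Qwen1.5-0.5B-Chat')  # picks the newest timestamped file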
12
utils/download_dataset.py
Normal file
@@ -0,0 +1,12 @@
import pickle
from conversation import mkdir
from datasets import load_dataset

dataset_id = 'OpenAssistant/oasst2'

# This script is meant to be run from inside utils/ (it imports `conversation` directly),
# so the dataset is stored one level up, in ../datasets
mkdir('../datasets')
pickle_filename = '../datasets/' + dataset_id.replace('/', '_') + '.pickle'
dataset = load_dataset(dataset_id)
with open(pickle_filename, 'wb') as file:
    pickle.dump(dataset, file)
print('Saved as pickle to ' + pickle_filename)
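
A minimal sketch of reading the saved dataset back (the path follows the naming scheme above and assumes the same working directory):

    import pickle

    with open('../datasets/OpenAssistant_oasst2.pickle', 'rb') as file:
        dataset = pickle.load(file)
    print(dataset)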
18
utils/download_model.py
Normal file
@@ -0,0 +1,18 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer

from conversation import mkdir

model_id = 'Qwen/Qwen1.5-0.5B-Chat'
# model_id = 'Qwen/Qwen1.5-1.8B-Chat'

print('Downloading ' + model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_id)
print('Downloaded')

# A bare Trainer is only used here so that save_model() writes the model and tokenizer
# to a local directory in one call
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
)
mkdir('models')
trainer.save_model('./models/' + model_id.replace('/', '_'))
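
A minimal sketch of loading the locally saved copy later (the path follows the naming scheme above):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    local_path = './models/Qwen_Qwen1.5-0.5B-Chat'
    model = AutoModelForCausalLM.from_pretrained(local_path, torch_dtype='auto', device_map='auto')
    tokenizer = AutoTokenizer.from_pretrained(local_path)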
3
utils/fix_cuda.sh
Executable file
@@ -0,0 +1,3 @@
# https://medium.com/@Spritan/dealing-with-cuda-initialization-error-aa7c88d021e4
sudo rmmod nvidia_uvm
sudo modprobe nvidia_uvm
22
utils/pickle2json.py
Normal file
@@ -0,0 +1,22 @@
import json
import pickle
import sys

files = sys.argv[1:]
print(files)

for pickle_filename in files:
    if not pickle_filename.endswith('.pickle'):
        print(pickle_filename + ' is not a pickle file, ignoring')
        continue

    with open(pickle_filename, 'rb') as file:
        obj = pickle.load(file)
    print(obj)

    # replace the trailing 'pickle' extension with 'json'
    json_filename = pickle_filename[0:-6] + 'json'
    try:
        with open(json_filename, 'w') as file:
            json.dump(obj, file)
    except Exception as e:
        print(e)
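
Example invocation (hypothetical path, assuming conversations were saved by utils/conversation.py and the script is run from the repository root):

    python utils/pickle2json.py conversations/microsoft_DialoGPT-medium/20240101120000.pickle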
14
utils/prompt.py
Normal file
@@ -0,0 +1,14 @@
def prompt(prompt):
    while True:
        try:
            return input(prompt)
        except EOFError:
            print()
            exit(0)
        except KeyboardInterrupt:
            print()
            exit(0)
        # in case: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 11: invalid continuation byte
        except UnicodeDecodeError as e:
            print(e)
            print('prompt ignored')
25
utils/split_shuffle_dataset.py
Normal file
@@ -0,0 +1,25 @@
import json
import os
import random
import sys

original = sys.argv[1]
no_dataset = int(sys.argv[2])

if not original.endswith('.jsonl') or not os.path.isfile(original):
    print('Not a jsonl file')
    exit(1)

out_dir = os.path.dirname(os.path.abspath(original))

with open(original, 'r') as f:
    lines = f.readlines()

random.shuffle(lines)

for i in range(no_dataset):
    l = int(i * len(lines) / no_dataset)
    u = int((i + 1) * len(lines) / no_dataset)
    # expects the input filename to contain '_all'; it is replaced with the split index
    out_filename = os.path.basename(original)[0:-6].replace('_all', '_' + str(i)) + '.jsonl'
    with open(out_dir + '/' + out_filename, 'w') as f:
        f.writelines(lines[l:u])
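
Example invocation (hypothetical path): shuffle and split a dataset into 4 parts, producing dataset_0.jsonl ... dataset_3.jsonl next to the input file:

    python utils/split_shuffle_dataset.py datasets/dataset_all.jsonl 4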