# llm/train/prepare/helpsteer/helpsteer22jsonl.py

import json
import os
import random

# Pool of first names; one is picked at random as the 'name' of the user turn
# in every generated conversation.
user_names = [
    'Adam',
    'Alice',
    'Anne',
    'Bob',
    'Charlie',
    'Cody',
    'Corinna',
    'Cynthia',
    'Fred',
    'Grace',
    'Jane',
    'Paul',
    'Rachel',
    'Ramesh',
]


def mkdir(path):
    """Create directory *path* if it does not already exist.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Raises:
        FileExistsError: if *path* exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # isdir() + mkdir() and also copes with absent parent directories.
    os.makedirs(path, exist_ok=True)


# Anchor every path to this script's directory so it works from any cwd.
this_dir = os.path.dirname(os.path.abspath(__file__))
# Create the output tree one level at a time: parent first, then the
# dataset sub-directory.
mkdir(this_dir + '/../../data')
mkdir(this_dir + '/../../data/helpsteer')

# Convert each split into chat-style conversations, one JSON object per line.
for filename in ['train.jsonl', 'validation.jsonl']:
    # Pin UTF-8 rather than relying on the platform default encoding, and
    # skip blank lines, which json.loads would reject.
    with open(this_dir + '/' + filename, 'r', encoding='utf-8') as f:
        lines = [json.loads(line) for line in f if line.strip()]

    # Each record becomes a two-turn conversation: the 'prompt' field is the
    # user turn (tagged with a randomly chosen name) and the 'response' field
    # is the assistant turn.
    conversations = [{'messages': [
        {'role': 'user', 'name': random.choice(user_names), 'content': line['prompt']},
        {'role': 'assistant', 'name': 'assistant', 'content': line['response']}]
    } for line in lines]

    # Print one sample as a sanity check; an empty split has nothing to show.
    if conversations:
        print(conversations[0])

    # 'train.jsonl' -> 'train', 'validation.jsonl' -> 'validation'.
    split_name = os.path.splitext(filename)[0]
    with open(this_dir + '/../../data/helpsteer/helpsteer_' + split_name + '_all.jsonl', 'w',
              encoding='utf-8') as f:
        f.writelines(json.dumps(conv) + '\n' for conv in conversations)