initial commit
This commit is contained in:
27
train/prepare/helpsteer/helpsteer22jsonl.py
Normal file
27
train/prepare/helpsteer/helpsteer22jsonl.py
Normal file
@@ -0,0 +1,27 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
this_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
|
||||
def mkdir(path):
    """Create the directory ``path`` if it does not already exist.

    Missing parent directories are created as well, and an already
    existing directory is left untouched.

    Args:
        path: File-system path of the directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # ``isdir`` followed by ``mkdir`` and also copes with missing parents.
    os.makedirs(path, exist_ok=True)
|
||||
|
||||
|
||||
# Make sure the output directory tree exists before any file is written.
mkdir(this_dir + '/../../data')
mkdir(this_dir + '/../../data/helpsteer')

# Convert each HelpSteer split into chat-style JSONL: one
# {'messages': [user turn, assistant turn]} object per output line.
for filename in ['train.jsonl', 'validation.jsonl']:
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(this_dir + '/' + filename, 'r', encoding='utf-8') as f:
        # Iterate the file directly instead of loading every raw line first,
        # and skip blank lines, which json.loads would reject.
        records = [json.loads(line) for line in f if line.strip()]

    conversations = [
        {'messages': [
            {'role': 'user', 'content': record['prompt']},
            {'role': 'assistant', 'content': record['response']},
        ]}
        for record in records
    ]

    # Print one sample as a sanity check; an empty split has nothing to show.
    if conversations:
        print(conversations[0])

    split_name = filename[0:-6]  # drop the '.jsonl' extension
    out_path = (this_dir + '/../../data/helpsteer/helpsteer_'
                + split_name + '_all.jsonl')
    with open(out_path, 'w', encoding='utf-8') as f:
        f.writelines([json.dumps(conv) + '\n' for conv in conversations])
|
||||
42
train/prepare/oasst2/oasst22jsonl.py
Normal file
42
train/prepare/oasst2/oasst22jsonl.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
# parsing OA data files with oasst_data helpers
|
||||
from oasst_data import read_message_trees, ExportMessageNode
|
||||
|
||||
# Module-level message list; nothing in the visible part of this script
# reads from or appends to it.
messages: list[ExportMessageNode] = []

# Directory containing this script; the input dump is looked up next to it.
this_dir = os.path.dirname(os.path.abspath(__file__))
input_file_path = f'{this_dir}/2023-11-05_oasst2_all.trees.jsonl.gz'

# Maps the role names found on message nodes to the chat roles written out.
role_dict = dict(prompter='user', assistant='assistant')

# Filled by visit(): one {'messages': [...]} entry per finished conversation.
conversations = []
|
||||
|
||||
|
||||
def visit(node: ExportMessageNode, parents: list[ExportMessageNode]):
    """Collect every root-to-leaf path below ``node`` as a conversation.

    Walks the reply tree depth-first. Whenever a node without replies is
    reached, the chain of messages from the first ancestor down to that
    node is appended to the module-level ``conversations`` list as
    ``{'messages': [{'role': ..., 'content': ...}, ...]}``.

    Args:
        node: Message node to visit.
        parents: Ancestors of ``node``, outermost first; not modified.

    Raises:
        KeyError: If a message role is not a key of ``role_dict``.
    """
    # Build a new list so sibling branches never share or mutate a path.
    path = parents + [node]
    if not node.replies:  # end of conversation
        conversations.append({
            'messages': [
                {'role': role_dict[msg.role], 'content': msg.text}
                for msg in path
            ],
        })
        return
    for reply in node.replies:
        visit(reply, path)
|
||||
|
||||
|
||||
# Flatten every English message tree into linear conversations.
for tree in read_message_trees(input_file_path):
    if tree.prompt.lang != 'en':  # filtering by language tag (optional)
        continue

    visit(tree.prompt, [])

# Print one sample as a sanity check. Guarded so that an input in which no
# tree passes the language filter does not fail with IndexError here.
if conversations:
    print(conversations[0])
|
||||
|
||||
|
||||
def mkdir(path):
    """Create the directory ``path`` if it does not already exist.

    Missing parent directories are created as well, and an already
    existing directory is left untouched.

    Args:
        path: File-system path of the directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # ``isdir`` followed by ``mkdir`` and also copes with missing parents.
    os.makedirs(path, exist_ok=True)
|
||||
|
||||
|
||||
# Create the output directories one level at a time, then dump every
# collected conversation as one JSON object per line.
mkdir(f'{this_dir}/../../data')
mkdir(f'{this_dir}/../../data/oasst')
with open(f'{this_dir}/../../data/oasst/oasst_all.jsonl', 'w') as f:
    f.writelines(f'{json.dumps(conv)}\n' for conv in conversations)
|
||||
Reference in New Issue
Block a user