added name to training sample conversion for helpsteer and oasst

This commit is contained in:
wea_ondara
2024-04-24 19:26:22 +02:00
parent f4aefb3278
commit 9cafd8b1e9
2 changed files with 19 additions and 7 deletions

View File

@@ -1,5 +1,7 @@
import json
import os
import random
from typing import AnyStr
# parsing OA data files with oasst_data helpers
from oasst_data import read_message_trees, ExportMessageNode
@@ -10,23 +12,28 @@ this_dir = os.path.dirname(os.path.abspath(__file__))
# Gzipped JSONL export of the message trees, expected to sit next to this script.
input_file_path = f'{this_dir}/2023-11-05_oasst2_all.trees.jsonl.gz'

# Maps the role labels found in the export to the role labels written to the output.
role_dict = {
    'prompter': 'user',
    'assistant': 'assistant',
}

# Pool of names; one is drawn per message tree and used as the 'name' of
# every non-assistant message in that tree.
user_names = [
    'Adam', 'Alice', 'Anne', 'Bob', 'Charlie', 'Cody', 'Corinna',
    'Cynthia', 'Fred', 'Grace', 'Jane', 'Paul', 'Rachel', 'Ramesh',
]

# Accumulates one {'messages': [...]} entry per root-to-leaf path.
conversations = []
def visit(node: ExportMessageNode, parents: [ExportMessageNode], user: str):
    """Walk a message tree depth-first and record every root-to-leaf path.

    Each path from the tree's prompt down to a message without replies is
    appended to the module-level ``conversations`` list as one
    ``{'messages': [...]}`` entry.

    :param node: the message currently being visited
    :param parents: the messages on the path from the root down to, but not
        including, ``node``
    :param user: the name attached to every non-assistant message of this
        tree; assistant messages are always named ``'assistant'``
    """
    # Build a new list instead of mutating `parents`, so sibling branches
    # do not see each other's messages.
    new_parents = parents + [node]

    if node.replies:
        for reply in node.replies:
            visit(reply, new_parents, user)
        return

    # No replies: end of conversation, emit the whole path.
    messages = []
    for p in new_parents:
        role = role_dict[p.role]
        messages.append({
            'role': role,
            'name': user if role != 'assistant' else 'assistant',
            'content': p.text,
        })
    conversations.append({'messages': messages})
# Flatten every English message tree into linear conversations.
for tree in read_message_trees(input_file_path):
    if tree.prompt.lang not in ['en']:  # filtering by language tag (optional)
        continue
    # Draw one user name per tree so that all conversations branching from
    # the same prompt share the same user name.
    visit(tree.prompt, [], random.choice(user_names))

# Sanity check: show the first converted conversation.
print(conversations[0])