import json
import os

# parsing OA data files with oasst_data helpers
from oasst_data import read_message_trees, ExportMessageNode

# NOTE(review): not referenced anywhere else in the visible script; kept in
# case something outside this file relies on it.
messages: list[ExportMessageNode] = []

# All paths are resolved relative to the directory containing this script.
this_dir = os.path.dirname(os.path.abspath(__file__))
input_file_path = this_dir + '/2023-11-05_oasst2_all.trees.jsonl.gz'

# Maps the role stored on each message node to the role name written out.
role_dict = {'prompter': 'user', 'assistant': 'assistant'}

# Filled by visit(): one entry per root-to-leaf path of every visited tree.
conversations = []


def visit(node: ExportMessageNode, parents: list[ExportMessageNode]):
    """Walk the reply tree below *node* and record one conversation per leaf.

    Each node without replies ends a conversation: the path from the root of
    the walk down to that node is appended to the module-level
    ``conversations`` list as ``{'messages': [{'role': ..., 'content': ...}]}``.

    Args:
        node: The message node to visit.
        parents: The nodes on the path leading to ``node`` (root first). The
            list is not mutated; a new list is built for each level.

    Raises:
        KeyError: If a node's ``role`` is not a key of ``role_dict``.
    """
    new_parents = parents + [node]
    if not node.replies:
        # end of conversation
        conversations.append({
            'messages': [
                {'role': role_dict[p.role], 'content': p.text}
                for p in new_parents
            ],
        })
    else:
        for reply in node.replies:
            visit(reply, new_parents)


for tree in read_message_trees(input_file_path):
    if tree.prompt.lang not in ['en']:  # filtering by language tag (optional)
        continue
    visit(tree.prompt, [])

# Show one sample as a sanity check; guard against an empty result so the
# script does not die with IndexError when no tree passes the filter.
if conversations:
    print(conversations[0])
else:
    print('no conversations found')


def mkdir(path):
    """Create directory *path* (and any missing parents) if it does not exist."""
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(path, exist_ok=True)


mkdir(this_dir + '/../../data')
mkdir(this_dir + '/../../data/oasst')

# json.dumps escapes non-ASCII characters by default, so the output is plain
# ASCII; the encoding is still pinned so it never depends on the locale.
with open(this_dir + '/../../data/oasst/oasst_all.jsonl', 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(conv) + '\n' for conv in conversations)