26 lines
645 B
Python
26 lines
645 B
Python
import json
|
|
import os
|
|
import random
|
|
import sys
|
|
|
|
JSONL_SUFFIX = '.jsonl'


def _shard_bounds(total, parts, index):
    """Return the ``(lower, upper)`` slice bounds of shard *index*.

    Uses exact integer floor division so the bounds never suffer from
    floating-point rounding; consecutive shards are contiguous and together
    cover ``range(total)``.
    """
    return index * total // parts, (index + 1) * total // parts


def _shard_filename(original, index):
    """Return the output file name for shard *index* of *original*.

    Every ``_all`` in the base name is replaced by ``_<index>``. If the name
    contains no ``_all``, ``_<index>`` is appended instead, so a shard can
    never end up with the same name as the input file.
    """
    stem = os.path.basename(original)[:-len(JSONL_SUFFIX)]
    tag = '_' + str(index)
    if '_all' in stem:
        stem = stem.replace('_all', tag)
    else:
        # Without this branch every shard would be written over the input.
        stem += tag
    return stem + JSONL_SUFFIX


def split_dataset(original, no_dataset):
    """Shuffle the lines of *original* and split them into *no_dataset* files.

    The shards are written next to the input file and are as equal in size
    as possible. A non-positive *no_dataset* writes nothing.

    Args:
        original: Path to an existing ``.jsonl`` file.
        no_dataset: Number of shard files to produce.

    Returns:
        The list of paths written, in shard order.
    """
    out_dir = os.path.dirname(os.path.abspath(original))

    with open(original, 'r') as f:
        lines = f.readlines()

    # Only the last line can lack a newline; terminate it so that shuffling
    # cannot glue it onto whichever record ends up after it.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'

    random.shuffle(lines)

    written = []
    for i in range(no_dataset):
        lower, upper = _shard_bounds(len(lines), no_dataset, i)
        out_path = os.path.join(out_dir, _shard_filename(original, i))
        with open(out_path, 'w') as f:
            f.writelines(lines[lower:upper])
        written.append(out_path)
    return written


def main(argv=None):
    """Command-line entry point: ``<file.jsonl> <number_of_datasets>``.

    Exits with status 1 when the arguments are missing or malformed, or when
    the first argument is not an existing ``.jsonl`` file.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    usage = 'Usage: <file.jsonl> <number_of_datasets>'
    if len(args) < 2:
        sys.exit(usage)

    original = args[0]
    try:
        no_dataset = int(args[1])
    except ValueError:
        sys.exit(usage)

    if not original.endswith(JSONL_SUFFIX) or not os.path.isfile(original):
        print('Not a jsonl file')
        sys.exit(1)

    split_dataset(original, no_dataset)


if __name__ == '__main__':
    main()
|