initial commit
This commit is contained in:
25
utils/split_shuffle_dataset.py
Normal file
25
utils/split_shuffle_dataset.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""Shuffle a .jsonl file and split it into N roughly equal parts.

Usage:
    python split_shuffle_dataset.py <file.jsonl> <number_of_datasets>

The parts are written next to the input file.  If the input file name
contains ``_all`` that marker is replaced by ``_<index>`` (``foo_all.jsonl``
-> ``foo_0.jsonl``, ``foo_1.jsonl``, ...); otherwise ``_<index>`` is appended
to the name (``foo.jsonl`` -> ``foo_0.jsonl``, ...).
"""
import json
import os
import random
import sys

_SUFFIX = '.jsonl'


def _output_name(original, index):
    """Return the file name (no directory) of split number *index*."""
    stem = os.path.basename(original)[:-len(_SUFFIX)]
    if '_all' in stem:
        return stem.replace('_all', '_' + str(index)) + _SUFFIX
    # Without an '_all' marker the replace() above would be a no-op and every
    # split would be written over the input file itself, so append the index.
    return stem + '_' + str(index) + _SUFFIX


def _read_lines(path):
    """Read *path* and return its lines, each terminated by a newline."""
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # An unterminated last line would be glued to whatever line follows it
    # after shuffling, producing a corrupt record.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'
    return lines


def main(argv=None):
    """Shuffle and split the dataset described by *argv*.

    Args:
        argv: ``[path_to_jsonl, number_of_datasets]``, i.e. the command-line
            arguments without the program name.  Defaults to
            ``sys.argv[1:]``.  Extra arguments are ignored.

    Returns:
        The process exit code: 0 on success, 1 on invalid arguments.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        print('Usage: split_shuffle_dataset.py <file.jsonl> <number_of_datasets>')
        return 1

    original = args[0]
    try:
        no_dataset = int(args[1])
    except ValueError:
        print('Number of datasets must be an integer')
        return 1

    if not original.endswith(_SUFFIX) or not os.path.isfile(original):
        print('Not a jsonl file')
        return 1
    if no_dataset < 1:
        print('Number of datasets must be at least 1')
        return 1

    out_dir = os.path.dirname(os.path.abspath(original))
    lines = _read_lines(original)
    random.shuffle(lines)

    total = len(lines)
    for i in range(no_dataset):
        # Integer arithmetic keeps the slice bounds exact for any file size.
        lower = i * total // no_dataset
        upper = (i + 1) * total // no_dataset
        out_path = os.path.join(out_dir, _output_name(original, i))
        with open(out_path, 'w', encoding='utf-8') as f:
            f.writelines(lines[lower:upper])
    return 0


if __name__ == '__main__':
    sys.exit(main())
|
||||
Reference in New Issue
Block a user