initial commit

This commit is contained in:
wea_ondara
2024-04-17 18:58:50 +02:00
commit 41d5b4f591
22 changed files with 1611 additions and 0 deletions

View File

@@ -0,0 +1,25 @@
import json
import os
import random
import sys
def split_dataset(original, no_dataset):
    """Shuffle the lines of a JSONL file and split them into shards.

    The shards are written next to ``original``. The shard name is the input
    name with every ``_all`` replaced by ``_<index>``; when the input name
    contains no ``_all``, ``_<index>`` is appended to the stem instead so a
    shard can never be written over the input file.

    Args:
        original: Path to the input ``.jsonl`` file.
        no_dataset: Number of shards to create; must be at least 1.

    Returns:
        The list of shard paths, in shard-index order.

    Raises:
        ValueError: If ``no_dataset`` is smaller than 1.
        OSError: If the input cannot be read or a shard cannot be written.
    """
    if no_dataset < 1:
        raise ValueError('Number of datasets must be at least 1')

    out_dir = os.path.dirname(os.path.abspath(original))
    with open(original, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # A final record without a trailing newline would be glued to whatever
    # record follows it once the order is shuffled.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'
    random.shuffle(lines)

    stem = os.path.basename(original)[:-len('.jsonl')]
    total = len(lines)
    out_paths = []
    for i in range(no_dataset):
        # Integer arithmetic keeps the shard boundaries exact and contiguous.
        lower = i * total // no_dataset
        upper = (i + 1) * total // no_dataset
        if '_all' in stem:
            out_stem = stem.replace('_all', '_' + str(i))
        else:
            # Without the marker the replace would be a no-op and every
            # shard would overwrite the input file itself.
            out_stem = stem + '_' + str(i)
        out_path = os.path.join(out_dir, out_stem + '.jsonl')
        with open(out_path, 'w', encoding='utf-8') as f:
            f.writelines(lines[lower:upper])
        out_paths.append(out_path)
    return out_paths


def main(argv=None):
    """Command-line entry point: ``<script> <file.jsonl> <number_of_datasets>``.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        The process exit code: 0 on success, 1 on invalid arguments.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        print('Usage: ' + os.path.basename(sys.argv[0])
              + ' <file.jsonl> <number_of_datasets>')
        return 1

    original = args[0]
    try:
        no_dataset = int(args[1])
    except ValueError:
        print('Number of datasets must be an integer')
        return 1

    if not original.endswith('.jsonl') or not os.path.isfile(original):
        print('Not a jsonl file')
        return 1

    try:
        split_dataset(original, no_dataset)
    except ValueError as err:
        print(err)
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())