added util to reduce (partial) duplicate recorded conversations
This commit is contained in:
37
utils/reduce_conversations.py
Normal file
37
utils/reduce_conversations.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
# Absolute path of the directory that holds this script.
this_dir = os.path.dirname(os.path.abspath(__file__))
# The recorded conversations live in a sibling folder of this script's directory.
conversations_dir = this_dir + '/../conversations/'

# Every sub-directory of the conversations folder, expressed relative to the
# current working directory (os.path.relpath also normalises the '/../' part).
dirs = [
    candidate
    for candidate in (
        os.path.relpath(conversations_dir + entry)
        for entry in os.listdir(conversations_dir)
    )
    if os.path.isdir(candidate)
]
|
||||
|
||||
|
||||
def read_json(path):
    """Load and return the JSON document stored in the file at *path*.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    default encoding, so non-ASCII content is read the same way on every
    system.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)
|
||||
|
||||
|
||||
def _is_prefix(shorter, longer):
    """Return True if *shorter* is equal to the leading part of *longer*."""
    return len(shorter) <= len(longer) and shorter == longer[0:len(shorter)]


def _reduce_directory(directory):
    """Delete conversation files in *directory* that a later file already covers.

    The regular files of *directory* are sorted by path. A file is removed
    when its parsed content is a prefix of (or equal to) the content of any
    file that sorts after it; files are only ever compared against later
    ones, never earlier ones.

    NOTE(review): the prefix test slices the parsed JSON, so this assumes
    every file holds a JSON array - confirm against the recorder's output.

    Args:
        directory: Path of the directory whose files should be reduced.
    """
    paths = sorted(
        path
        for path in (directory + '/' + name for name in os.listdir(directory))
        if os.path.isfile(path)
    )
    if len(paths) < 2:
        # Nothing to compare against, so there is no need to parse anything.
        return

    # Parse every file exactly once instead of re-reading it for each pair.
    conversations = [read_json(path) for path in paths]

    for index, path in enumerate(paths):
        current = conversations[index]
        # Later files are never deleted before this one is examined, so
        # scanning the full tail matches a pairwise walk over the survivors.
        if any(_is_prefix(current, later) for later in conversations[index + 1:]):
            print(' Dropping ' + path)
            os.remove(path)


for conversation_dir in dirs:
    print('Reducing ' + conversation_dir)
    _reduce_directory(conversation_dir)
|
||||
Reference in New Issue
Block a user