Added a utility to reduce (partially) duplicated recorded conversations
This commit is contained in:
37
utils/reduce_conversations.py
Normal file
37
utils/reduce_conversations.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Absolute directory containing this script.
this_dir = os.path.dirname(os.path.abspath(__file__))
# The recorded conversations live in a sibling "conversations" folder.
conversations_dir = this_dir + '/../conversations/'

# Every sub-directory of the conversations folder, expressed relative to the
# current working directory; plain files in that folder are skipped.
dirs = [
    candidate
    for candidate in (
        os.path.relpath(conversations_dir + entry)
        for entry in os.listdir(conversations_dir)
    )
    if os.path.isdir(candidate)
]
|
||||||
|
|
||||||
|
|
||||||
|
def read_json(path):
    """Load and return the JSON document stored in the file at *path*.

    The file is decoded explicitly as UTF-8 instead of the platform's
    locale-dependent default, so conversations containing non-ASCII text
    are read the same way on every machine.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The deserialized JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
# For every conversation directory, delete each recording whose whole content
# is the beginning of (or identical to) a recording that sorts after it by
# file name. Only that direction is checked: a later file that is a prefix of
# an earlier one is kept.
# NOTE(review): the slicing below assumes each file holds a JSON array -- confirm.
for conversation_dir in dirs:
    print('Reducing ' + conversation_dir)
    files = sorted(
        path
        for path in (
            conversation_dir + '/' + name
            for name in os.listdir(conversation_dir)
        )
        if os.path.isfile(path)
    )
    if len(files) < 2:
        # A lone (or missing) recording has nothing to be compared against,
        # so it is neither read nor removed.
        continue

    # Parse every file once up front instead of re-reading both files from
    # disk for each compared pair.
    conversations = [read_json(path) for path in files]

    for index, (path, conversation) in enumerate(zip(files, conversations)):
        length = len(conversation)
        # Files are visited in ascending order and only the current one can
        # be removed, so every later recording still exists on disk here.
        later_conversations = conversations[index + 1:]
        if any(
            length <= len(later) and conversation == later[:length]
            for later in later_conversations
        ):
            print(' Dropping ' + path)
            os.remove(path)
|
||||||
Reference in New Issue
Block a user