"""Drop conversation files that are a prefix of a later conversation file.

Script location: utils/reduce_conversations.py

Every sub-directory of ``../conversations/`` (relative to this file) is
scanned. Inside one directory the regular files are sorted by name, each is
parsed as JSON, and a file is deleted when its content equals the leading
part of (or all of) the content of a file that sorts after it.

Only "earlier file is a prefix of a later file" is checked; a later file
that is a prefix of an earlier one is kept.
"""
import json
import os

this_dir = os.path.dirname(os.path.abspath(__file__))
conversations_dir = this_dir + '/../conversations/'


def read_json(path):
    """Parse the JSON document stored at ``path`` and return the result."""
    # JSON interchange is UTF-8; do not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _is_prefix(candidate, other):
    """Return True when ``candidate`` equals the leading part of ``other``.

    Both values are sliced/compared as sequences.
    NOTE(review): assumes each file holds a top-level JSON array - confirm.
    """
    return len(candidate) <= len(other) and candidate == other[0:len(candidate)]


def _find_redundant(conversations):
    """Return the indices whose conversation is a prefix of a later one."""
    return [
        index
        for index, conversation in enumerate(conversations)
        if any(_is_prefix(conversation, later)
               for later in conversations[index + 1:])
    ]


def reduce_directory(directory):
    """Delete the redundant conversation files directly inside ``directory``.

    Sub-directories are ignored. Returns the list of removed paths, in the
    order they were removed (ascending file name).
    """
    print('Reducing ' + directory)
    files = sorted(
        path
        for path in (os.path.join(directory, name) for name in os.listdir(directory))
        if os.path.isfile(path)
    )

    # Parse every file exactly once, and before anything is deleted, so a
    # malformed file aborts the run without leaving a half-reduced directory.
    conversations = [read_json(path) for path in files]

    dropped = []
    for index in _find_redundant(conversations):
        print(' Dropping ' + files[index])
        os.remove(files[index])
        dropped.append(files[index])
    return dropped


def _conversation_dirs(root):
    """Return the sub-directories of ``root`` as paths relative to the cwd."""
    candidates = [os.path.relpath(os.path.join(root, name)) for name in os.listdir(root)]
    return [path for path in candidates if os.path.isdir(path)]


def main():
    """Reduce every conversation directory below ``conversations_dir``."""
    for directory in _conversation_dirs(conversations_dir):
        reduce_directory(directory)


# Guarded so that importing this module never deletes files.
if __name__ == '__main__':
    main()