From b311c9372a05832019ff782459d5c07d3910e6bc Mon Sep 17 00:00:00 2001
From: wea_ondara
Date: Wed, 17 Apr 2024 21:07:49 +0200
Subject: [PATCH] added util to reduce (partial) duplicate recorded
 conversations

---
 utils/reduce_conversations.py | 37 +++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 utils/reduce_conversations.py

diff --git a/utils/reduce_conversations.py b/utils/reduce_conversations.py
new file mode 100644
index 0000000..ec6567b
--- /dev/null
+++ b/utils/reduce_conversations.py
@@ -0,0 +1,37 @@
+import json
+import os
+
+this_dir = os.path.dirname(os.path.abspath(__file__))
+conversations_dir = this_dir + '/../conversations/'
+
+dirs = [os.path.relpath(conversations_dir + dir) for dir in os.listdir(conversations_dir)]
+dirs = [dir for dir in dirs if os.path.isdir(dir)]
+
+
+def read_json(path):
+    with open(path, 'r') as f:
+        return json.load(f)
+
+
+for dir in dirs:
+    print('Reducing ' + dir)
+    files = [dir + '/' + file for file in os.listdir(dir)]
+    files = [file for file in files if os.path.isfile(file)]
+    files.sort()
+
+    i = 0
+    while i < len(files) - 1:
+        j = i + 1
+        while j < len(files):
+            conversation1 = read_json(files[i])
+            conversation2 = read_json(files[j])
+
+            if len(conversation1) <= len(conversation2) and conversation1 == conversation2[0:len(conversation1)]:
+                print(' Dropping ' + files[i])
+                os.remove(files[i])
+                del files[i]
+                i -= 1
+                break
+            else:
+                j += 1
+        i += 1
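
Note (not part of the patch): the dedup rule in the script is a plain prefix comparison on the parsed JSON lists, so a shorter recording is deleted whenever it matches the beginning of a longer recording in the same directory. A minimal self-contained sketch of that check follows; the message dicts below are made up for illustration and are not taken from the repository's actual conversation schema.

    def is_prefix_duplicate(shorter, longer):
        # True when `shorter` equals the first len(shorter) entries of `longer`
        return len(shorter) <= len(longer) and shorter == longer[:len(shorter)]

    conversation_a = [{"role": "user", "text": "hi"}]
    conversation_b = [{"role": "user", "text": "hi"},
                      {"role": "assistant", "text": "hello"}]

    assert is_prefix_duplicate(conversation_a, conversation_b)      # conversation_a would be dropped
    assert not is_prefix_duplicate(conversation_b, conversation_a)  # conversation_b is kept

The script itself takes no arguments: it resolves the conversations/ directory relative to its own location and scans every subdirectory in it, so it can be run directly with python utils/reduce_conversations.py.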