From 8ea320812f6367fcefaef29eba39f040688d4a2e Mon Sep 17 00:00:00 2001 From: Steve Howell Date: Thu, 9 Dec 2021 15:17:58 +0000 Subject: [PATCH] user exports: Chunkify messages in sorted order. This accomplishes a few things: * It extracts `chunkify` rather than having us clumsily track chunking-related stuff in a big loop that is doing other stuff. * It makes it so that all message ids in message-000001.json < message-000002.json. * It makes it easier for us to customize the messages we send to a single user (coming soon). BTW we probably have a slicker version of chunkify somewhere in our codebase, but I couldn't remember where. --- zerver/lib/export.py | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/zerver/lib/export.py b/zerver/lib/export.py index aa9cfa9261..629a9c0000 100644 --- a/zerver/lib/export.py +++ b/zerver/lib/export.py @@ -2038,21 +2038,42 @@ def get_single_user_config() -> Config: return user_profile_config +def chunkify(lst: List[int], chunk_size: int) -> List[List[int]]: + # chunkify([1,2,3,4,5], 2) == [[1,2], [3,4], [5]] + result = [] + i = 0 + while True: + chunk = lst[i : i + chunk_size] + if len(chunk) == 0: + break + else: + result.append(chunk) + i += chunk_size + + return result + + def export_messages_single_user( user_profile: UserProfile, output_dir: Path, chunk_size: int = MESSAGE_BATCH_CHUNK_SIZE ) -> None: - user_message_query = UserMessage.objects.filter(user_profile=user_profile).order_by("id") - min_id = -1 - dump_file_id = 1 - while True: - actual_query = user_message_query.select_related( - "message", "message__sending_client" - ).filter(id__gt=min_id)[0:chunk_size] - user_message_chunk = list(actual_query) - user_message_ids = {um.id for um in user_message_chunk} - if len(user_message_chunk) == 0: - break + # Do a slim query to find all message ids that pertain to us. + # TODO: be more selective about which message ids we export. + all_message_ids = ( + UserMessage.objects.filter(user_profile=user_profile) + .values_list("message_id", flat=True) + .order_by("message_id") + ) + + dump_file_id = 1 + for message_id_chunk in chunkify(all_message_ids, chunk_size): + fat_query = ( + UserMessage.objects.select_related("message", "message__sending_client") + .filter(user_profile=user_profile, message_id__in=message_id_chunk) + .order_by("message_id") + ) + + user_message_chunk = list(fat_query) message_chunk = [] for user_message in user_message_chunk: @@ -2071,7 +2092,6 @@ def export_messages_single_user( floatify_datetime_fields(output, "zerver_message") write_table_data(message_filename, output) - min_id = max(user_message_ids) dump_file_id += 1