def process_message_attachment(
    upload: Dict[str, Any],
    realm_id: int,
    message_id: int,
    user_id: int,
    user_handler: UserHandler,
    zerver_attachment: List[ZerverFieldsT],
    uploads_list: List[ZerverFieldsT],
    upload_id_to_upload_data_map: Dict[str, Dict[str, Any]],
    output_dir: str,
) -> Tuple[str, bool]:
    """Convert one Rocket.Chat file upload into a Zulip upload + attachment.

    The file is reassembled from its chunks and written below
    ``<output_dir>/uploads/``; a record describing it is appended to
    ``uploads_list`` and an attachment row is registered in
    ``zerver_attachment`` via ``build_attachment``.

    Args:
        upload: The ``file`` sub-document of the Rocket.Chat message; must
            contain ``_id``, ``name`` and ``type`` (a MIME type).
        realm_id: ID of the realm being imported into.
        message_id: Zulip ID of the message that carries the file.
        user_id: Zulip ID of the uploading user.
        user_handler: Used to look up the uploader's email address.
        zerver_attachment: Output list of attachment rows (mutated).
        uploads_list: Output list of upload records (mutated).
        upload_id_to_upload_data_map: Upload ``_id`` -> upload document with
            its binary chunks collected under the ``"chunk"`` key.
        output_dir: Root directory of the converted export.

    Returns:
        A ``(attachment_content, has_image)`` tuple: the Markdown text to
        append to the message body, and whether the file is an image.

    Raises:
        KeyError: If ``upload["_id"]`` is not present in
            ``upload_id_to_upload_data_map`` or a required field is missing.
    """
    upload_file_data = upload_id_to_upload_data_map[upload["_id"]]
    file_name = upload["name"]
    content_type = upload["type"]

    # The pseudo-extension comes from the MIME subtype ("image/png" ->
    # ".png"), not from the file name.
    file_ext = f'.{content_type.split("/")[-1]}'
    has_image = file_ext.lower() in IMAGE_EXTENSIONS

    s3_path = "/".join(
        [
            str(realm_id),
            format(random.randint(0, 255), "x"),
            secrets.token_urlsafe(18),
            sanitize_name(file_name),
        ]
    )

    # Build the attachment from chunks and save it to s3_path.  Writing the
    # chunks one by one avoids holding a second, concatenated copy of the
    # whole file in memory.
    file_out_path = os.path.join(output_dir, "uploads", s3_path)
    os.makedirs(os.path.dirname(file_out_path), exist_ok=True)
    with open(file_out_path, "wb") as upload_file:
        upload_file.writelines(upload_file_data["chunk"])

    # The description is optional in Rocket.Chat; a missing or null value
    # must not crash the import or be rendered as the text "None".
    description = upload_file_data.get("description") or ""
    attachment_content = f"{description}\n\n[{file_name}](/user_uploads/{s3_path})"

    fileinfo = {
        "name": file_name,
        "size": upload_file_data["size"],
        "created": upload_file_data["_updatedAt"].timestamp(),
    }

    # Use a distinct name instead of rebinding the ``upload`` parameter.
    upload_record = dict(
        path=s3_path,
        realm_id=realm_id,
        content_type=content_type,
        user_profile_id=user_id,
        last_modified=fileinfo["created"],
        user_profile_email=user_handler.get_user(user_id=user_id)["email"],
        s3_path=s3_path,
        size=fileinfo["size"],
    )
    uploads_list.append(upload_record)

    build_attachment(
        realm_id=realm_id,
        message_ids={message_id},
        user_id=user_id,
        fileinfo=fileinfo,
        s3_path=s3_path,
        zerver_attachment=zerver_attachment,
    )

    return attachment_content, has_image
def map_upload_id_to_upload_data(
    upload_data: Dict[str, List[Dict[str, Any]]],
) -> Dict[str, Dict[str, Any]]:
    """Index Rocket.Chat uploads by ``_id`` and attach their binary chunks.

    Args:
        upload_data: Mapping with an ``"upload"`` list of upload documents
            and a ``"chunk"`` list of chunk documents, each chunk carrying
            ``files_id`` and ``data`` (and normally a sequence number ``n``).

    Returns:
        A dict keyed by upload ``_id``.  Each value is a copy of the upload
        document with an extra ``"chunk"`` key holding the list of that
        file's chunk payloads in sequence order.
    """
    upload_id_to_upload_data_map: Dict[str, Dict[str, Any]] = {
        upload["_id"]: {**upload, "chunk": []} for upload in upload_data["upload"]
    }

    # Chunks are numbered per file by ``n``; the dump order is not guaranteed
    # to match it.  The sort is stable, so chunks without ``n`` keep their
    # original relative order.
    ordered_chunks = sorted(upload_data["chunk"], key=lambda chunk: chunk.get("n", 0))

    for chunk in ordered_chunks:
        upload_entry = upload_id_to_upload_data_map.get(chunk["files_id"])
        if upload_entry is None:
            # An orphaned chunk (no matching upload document) should not
            # abort the whole conversion; log it and move on.
            logging.info("Skipping chunk %s without upload metadata", chunk["files_id"])
            continue
        upload_entry["chunk"].append(chunk["data"])

    return upload_id_to_upload_data_map
do_convert_data(rocketchat_data_dir: str, output_dir: str) -> None: huddle_id_to_huddle_map=huddle_id_to_huddle_map, zerver_realmemoji=zerver_realmemoji, total_reactions=total_reactions, + uploads_list=uploads_list, + zerver_attachment=zerver_attachment, + upload_id_to_upload_data_map=upload_id_to_upload_data_map, output_dir=output_dir, ) # Process private messages @@ -848,6 +996,9 @@ def do_convert_data(rocketchat_data_dir: str, output_dir: str) -> None: huddle_id_to_huddle_map=huddle_id_to_huddle_map, zerver_realmemoji=zerver_realmemoji, total_reactions=total_reactions, + uploads_list=uploads_list, + zerver_attachment=zerver_attachment, + upload_id_to_upload_data_map=upload_id_to_upload_data_map, output_dir=output_dir, ) realm["zerver_reaction"] = total_reactions @@ -857,12 +1008,11 @@ def do_convert_data(rocketchat_data_dir: str, output_dir: str) -> None: create_converted_data_files(realm, output_dir, "/realm.json") # TODO: Add support for importing avatars create_converted_data_files([], output_dir, "/avatars/records.json") - # TODO: Add support for importing uploads - create_converted_data_files([], output_dir, "/uploads/records.json") - # TODO: Add support for importing attachments - attachment: Dict[str, List[Any]] = {"zerver_attachment": []} + # Import attachments + attachment: Dict[str, List[Any]] = {"zerver_attachment": zerver_attachment} create_converted_data_files(attachment, output_dir, "/attachment.json") + create_converted_data_files(uploads_list, output_dir, "/uploads/records.json") logging.info("Start making tarball") subprocess.check_call(["tar", "-czf", output_dir + ".tar.gz", output_dir, "-P"]) diff --git a/zerver/tests/fixtures/rocketchat_fixtures/rocketchat_message.bson b/zerver/tests/fixtures/rocketchat_fixtures/rocketchat_message.bson index efdd9761b2..12425f49e3 100644 Binary files a/zerver/tests/fixtures/rocketchat_fixtures/rocketchat_message.bson and b/zerver/tests/fixtures/rocketchat_fixtures/rocketchat_message.bson differ diff --git 
a/zerver/tests/fixtures/rocketchat_fixtures/rocketchat_uploads.bson b/zerver/tests/fixtures/rocketchat_fixtures/rocketchat_uploads.bson new file mode 100644 index 0000000000..80cbfa9195 Binary files /dev/null and b/zerver/tests/fixtures/rocketchat_fixtures/rocketchat_uploads.bson differ diff --git a/zerver/tests/fixtures/rocketchat_fixtures/rocketchat_uploads.chunks.bson b/zerver/tests/fixtures/rocketchat_fixtures/rocketchat_uploads.chunks.bson new file mode 100644 index 0000000000..999ef8a73e Binary files /dev/null and b/zerver/tests/fixtures/rocketchat_fixtures/rocketchat_uploads.chunks.bson differ diff --git a/zerver/tests/fixtures/rocketchat_fixtures/rocketchat_uploads.files.bson b/zerver/tests/fixtures/rocketchat_fixtures/rocketchat_uploads.files.bson new file mode 100644 index 0000000000..2dccc4e9ad Binary files /dev/null and b/zerver/tests/fixtures/rocketchat_fixtures/rocketchat_uploads.files.bson differ diff --git a/zerver/tests/test_rocketchat_importer.py b/zerver/tests/test_rocketchat_importer.py index 8f96d694e2..7ad943c818 100644 --- a/zerver/tests/test_rocketchat_importer.py +++ b/zerver/tests/test_rocketchat_importer.py @@ -14,8 +14,10 @@ from zerver.data_import.rocketchat import ( convert_stream_subscription_data, do_convert_data, map_receiver_id_to_recipient_id, + map_upload_id_to_upload_data, map_user_id_to_user, map_username_to_user_id, + process_message_attachment, process_users, rocketchat_data_to_dict, separate_channel_and_private_messages, @@ -32,7 +34,7 @@ class RocketChatImporter(ZulipTestCase): def test_rocketchat_data_to_dict(self) -> None: fixture_dir_name = self.fixture_file_name("", "rocketchat_fixtures") rocketchat_data = rocketchat_data_to_dict(fixture_dir_name) - self.assert_length(rocketchat_data, 6) + self.assert_length(rocketchat_data, 7) self.assert_length(rocketchat_data["user"], 6) self.assertEqual(rocketchat_data["user"][2]["username"], "harry.potter") @@ -42,7 +44,7 @@ class RocketChatImporter(ZulipTestCase): 
self.assertEqual(rocketchat_data["room"][0]["_id"], "GENERAL") self.assertEqual(rocketchat_data["room"][0]["name"], "general") - self.assert_length(rocketchat_data["message"], 58) + self.assert_length(rocketchat_data["message"], 63) self.assertEqual(rocketchat_data["message"][1]["msg"], "Hey everyone, how's it going??") self.assertEqual(rocketchat_data["message"][1]["rid"], "GENERAL") self.assertEqual(rocketchat_data["message"][1]["u"]["username"], "priyansh3133") @@ -50,6 +52,9 @@ class RocketChatImporter(ZulipTestCase): self.assert_length(rocketchat_data["custom_emoji"]["emoji"], 3) self.assertEqual(rocketchat_data["custom_emoji"]["emoji"][0]["name"], "tick") + self.assert_length(rocketchat_data["upload"]["upload"], 4) + self.assertEqual(rocketchat_data["upload"]["upload"][0]["name"], "harry-ron.jpg") + def test_map_user_id_to_user(self) -> None: fixture_dir_name = self.fixture_file_name("", "rocketchat_fixtures") rocketchat_data = rocketchat_data_to_dict(fixture_dir_name) @@ -613,9 +618,9 @@ class RocketChatImporter(ZulipTestCase): private_messages=private_messages, ) - self.assert_length(rocketchat_data["message"], 58) - self.assert_length(channel_messages, 53) - self.assert_length(private_messages, 5) + self.assert_length(rocketchat_data["message"], 63) + self.assert_length(channel_messages, 57) + self.assert_length(private_messages, 6) self.assertIn(rocketchat_data["message"][0], channel_messages) self.assertIn(rocketchat_data["message"][1], channel_messages) @@ -661,8 +666,22 @@ class RocketChatImporter(ZulipTestCase): ) # No new message added to channel or private messages - self.assert_length(channel_messages, 53) - self.assert_length(private_messages, 5) + self.assert_length(channel_messages, 57) + self.assert_length(private_messages, 6) + + def test_map_upload_id_to_upload_data(self) -> None: + fixture_dir_name = self.fixture_file_name("", "rocketchat_fixtures") + rocketchat_data = rocketchat_data_to_dict(fixture_dir_name) + + 
upload_id_to_upload_data_map = map_upload_id_to_upload_data(rocketchat_data["upload"]) + + self.assert_length(rocketchat_data["upload"]["upload"], 4) + self.assert_length(upload_id_to_upload_data_map, 4) + + upload_id = rocketchat_data["upload"]["upload"][0]["_id"] + upload_name = rocketchat_data["upload"]["upload"][0]["name"] + self.assertEqual(upload_id_to_upload_data_map[upload_id]["name"], upload_name) + self.assert_length(upload_id_to_upload_data_map[upload_id]["chunk"], 1) def test_build_reactions(self) -> None: fixture_dir_name = self.fixture_file_name("", "rocketchat_fixtures") @@ -734,6 +753,62 @@ class RocketChatImporter(ZulipTestCase): self.assert_length(self.get_set(total_reactions, "id"), 7) self.assert_length(self.get_set(total_reactions, "message"), 1) + def test_process_message_attachment(self) -> None: + fixture_dir_name = self.fixture_file_name("", "rocketchat_fixtures") + rocketchat_data = rocketchat_data_to_dict(fixture_dir_name) + output_dir = self.make_import_output_dir("mattermost") + + user_id_to_user_map = map_user_id_to_user(rocketchat_data["user"]) + + realm_id = 3 + domain_name = "zulip.com" + + user_handler = UserHandler() + user_id_mapper = IdMapper() + + process_users( + user_id_to_user_map=user_id_to_user_map, + realm_id=realm_id, + domain_name=domain_name, + user_handler=user_handler, + user_id_mapper=user_id_mapper, + ) + + zerver_attachments: List[ZerverFieldsT] = [] + uploads_list: List[ZerverFieldsT] = [] + + upload_id_to_upload_data_map = map_upload_id_to_upload_data(rocketchat_data["upload"]) + + message_with_attachment = rocketchat_data["message"][55] + + process_message_attachment( + upload=message_with_attachment["file"], + realm_id=3, + message_id=1, + user_id=3, + user_handler=user_handler, + zerver_attachment=zerver_attachments, + uploads_list=uploads_list, + upload_id_to_upload_data_map=upload_id_to_upload_data_map, + output_dir=output_dir, + ) + + self.assert_length(zerver_attachments, 1) + 
self.assertEqual(zerver_attachments[0]["file_name"], "harry-ron.jpg") + self.assertEqual(zerver_attachments[0]["owner"], 3) + self.assertEqual( + user_handler.get_user(zerver_attachments[0]["owner"])["email"], "harrypotter@email.com" + ) + # TODO: Assert this for False after fixing the file permissions in PMs + self.assertTrue(zerver_attachments[0]["is_realm_public"]) + + self.assert_length(uploads_list, 1) + self.assertEqual(uploads_list[0]["user_profile_email"], "harrypotter@email.com") + + attachment_out_path = os.path.join(output_dir, "uploads", zerver_attachments[0]["path_id"]) + self.assertTrue(os.path.exists(attachment_out_path)) + self.assertTrue(os.path.isfile(attachment_out_path)) + def read_file(self, team_output_dir: str, output_file: str) -> Any: full_path = os.path.join(team_output_dir, output_file) with open(full_path, "rb") as f: @@ -760,6 +835,7 @@ class RocketChatImporter(ZulipTestCase): self.assertEqual(os.path.exists(os.path.join(output_dir, "avatars")), True) self.assertEqual(os.path.exists(os.path.join(output_dir, "emoji")), True) + self.assertEqual(os.path.exists(os.path.join(output_dir, "uploads")), True) self.assertEqual(os.path.exists(os.path.join(output_dir, "attachment.json")), True) realm = self.read_file(output_dir, "realm.json") @@ -867,23 +943,41 @@ class RocketChatImporter(ZulipTestCase): for message in messages: self.assertIsNotNone(message.rendered_content) # After removing user_joined, added_user, discussion_created, etc. - # messages. (Total messages were 58.) - self.assert_length(messages, 31) + # messages. (Total messages were 63.) 
+ self.assert_length(messages, 36) stream_messages = messages.filter(recipient__type=Recipient.STREAM).order_by("date_sent") stream_recipients = stream_messages.values_list("recipient", flat=True) - self.assert_length(stream_messages, 26) + self.assert_length(stream_messages, 30) self.assert_length(set(stream_recipients), 5) self.assertEqual(stream_messages[0].sender.email, "priyansh3133@email.com") self.assertEqual(stream_messages[0].content, "Hey everyone, how's it going??") + self.assertEqual(stream_messages[23].sender.email, "harrypotter@email.com") + self.assertRegex( + stream_messages[23].content, + "Just a random pic!\n\n\\[harry-ron.jpg\\]\\(.*\\)", + ) + self.assertTrue(stream_messages[23].has_attachment) + self.assertTrue(stream_messages[23].has_image) + self.assertTrue(stream_messages[23].has_link) + huddle_messages = messages.filter(recipient__type=Recipient.HUDDLE).order_by("date_sent") huddle_recipients = huddle_messages.values_list("recipient", flat=True) - self.assert_length(huddle_messages, 2) + self.assert_length(huddle_messages, 3) self.assert_length(set(huddle_recipients), 1) self.assertEqual(huddle_messages[0].sender.email, "hermionegranger@email.com") self.assertEqual(huddle_messages[0].content, "Hey people!") + self.assertEqual(huddle_messages[2].sender.email, "harrypotter@email.com") + self.assertRegex( + huddle_messages[2].content, + "This year's curriculum is out.\n\n\\[Hogwarts Curriculum.pdf\\]\\(.*\\)", + ) + self.assertTrue(huddle_messages[2].has_attachment) + self.assertFalse(huddle_messages[2].has_image) + self.assertTrue(huddle_messages[2].has_link) + personal_messages = messages.filter(recipient__type=Recipient.PERSONAL).order_by( "date_sent" )