data_import: Import attachments from Rocket.Chat.

This commit is contained in:
Priyansh Garg 2021-08-04 21:14:30 +05:30 committed by Tim Abbott
parent 65e28907cb
commit aed4e48da7
6 changed files with 261 additions and 17 deletions

View File

@ -1,7 +1,9 @@
import logging
import os
import random
import secrets
import subprocess
from typing import Any, Dict, List, Set
from typing import Any, Dict, List, Set, Tuple
import bson
from django.conf import settings
@ -10,6 +12,7 @@ from django.forms.models import model_to_dict
from zerver.data_import.import_util import (
SubscriberHandler,
ZerverFieldsT,
build_attachment,
build_huddle,
build_huddle_subscriptions,
build_message,
@ -28,6 +31,8 @@ from zerver.data_import.import_util import (
from zerver.data_import.sequencer import NEXT_ID, IdMapper
from zerver.data_import.user_handler import UserHandler
from zerver.lib.emoji import name_to_codepoint
from zerver.lib.markdown import IMAGE_EXTENSIONS
from zerver.lib.upload import sanitize_name
from zerver.lib.utils import process_list_in_batches
from zerver.models import Reaction, RealmEmoji, Recipient, UserProfile
@ -344,6 +349,74 @@ def build_reactions(
total_reactions.append(reaction_dict)
def process_message_attachment(
    upload: Dict[str, Any],
    realm_id: int,
    message_id: int,
    user_id: int,
    user_handler: UserHandler,
    zerver_attachment: List[ZerverFieldsT],
    uploads_list: List[ZerverFieldsT],
    upload_id_to_upload_data_map: Dict[str, Dict[str, Any]],
    output_dir: str,
) -> Tuple[str, bool]:
    """Write one Rocket.Chat upload to disk and register it for import.

    The file is reassembled from the chunks stored in
    ``upload_id_to_upload_data_map`` and written below
    ``<output_dir>/uploads/``. An upload record is appended to
    ``uploads_list`` and ``build_attachment`` is called to register the
    attachment in ``zerver_attachment``.

    Args:
        upload: The ``file`` dict of the Rocket.Chat message; ``_id``,
            ``name`` and ``type`` are read from it.
        realm_id: Realm id, used as the first component of the upload path.
        message_id: Id of the message the file is attached to.
        user_id: Id of the uploading user.
        user_handler: Used to look up the uploader's email.
        zerver_attachment: Output list passed to ``build_attachment``.
        uploads_list: Output list receiving the upload record.
        upload_id_to_upload_data_map: Upload id -> upload metadata, with
            the file's binary chunks under the ``"chunk"`` key.
        output_dir: Root directory of the converted export.

    Returns:
        ``(attachment_content, has_image)``: the text to append to the
        message content (description plus a Markdown link to the file),
        and whether the file's type maps to a known image extension.
        ``("", False)`` is returned, and nothing is written, when the
        upload has no metadata in the map or carries no ``"type"``.
    """
    upload_id = upload["_id"]

    # A message can reference an upload whose metadata is absent from the
    # dump; skip it instead of aborting the whole conversion on a KeyError.
    if upload_id not in upload_id_to_upload_data_map:
        logging.info("Skipping unknown attachment of message_id: %s", message_id)
        return "", False

    # The content type is needed for both image detection and the upload record.
    if "type" not in upload:
        logging.info("Skipping attachment without type of message_id: %s", message_id)
        return "", False

    upload_file_data = upload_id_to_upload_data_map[upload_id]
    file_name = upload["name"]
    content_type = upload["type"]

    # The extension is derived from the MIME subtype, e.g. "image/png" -> ".png".
    file_ext = f'.{content_type.split("/")[-1]}'
    has_image = file_ext.lower() in IMAGE_EXTENSIONS

    # Path layout: <realm id>/<random hex bucket>/<random token>/<sanitized name>.
    s3_path = "/".join(
        [
            str(realm_id),
            format(random.randint(0, 255), "x"),
            secrets.token_urlsafe(18),
            sanitize_name(file_name),
        ]
    )

    # Build the attachment from chunks and save it to s3_path.
    file_out_path = os.path.join(output_dir, "uploads", s3_path)
    os.makedirs(os.path.dirname(file_out_path), exist_ok=True)
    with open(file_out_path, "wb") as upload_file:
        upload_file.write(b"".join(upload_file_data["chunk"]))

    # The description is optional; fall back to an empty string rather than
    # raising KeyError or rendering the literal text "None".
    description = upload_file_data.get("description") or ""
    attachment_content = f"{description}\n\n[{file_name}](/user_uploads/{s3_path})"

    fileinfo = {
        "name": file_name,
        "size": upload_file_data["size"],
        "created": float(upload_file_data["_updatedAt"].timestamp()),
    }

    # Use a distinct local name so the ``upload`` parameter is not shadowed.
    upload_record = dict(
        path=s3_path,
        realm_id=realm_id,
        content_type=content_type,
        user_profile_id=user_id,
        last_modified=fileinfo["created"],
        user_profile_email=user_handler.get_user(user_id=user_id)["email"],
        s3_path=s3_path,
        size=fileinfo["size"],
    )
    uploads_list.append(upload_record)

    build_attachment(
        realm_id=realm_id,
        message_ids={message_id},
        user_id=user_id,
        fileinfo=fileinfo,
        s3_path=s3_path,
        zerver_attachment=zerver_attachment,
    )

    return attachment_content, has_image
def process_raw_message_batch(
realm_id: int,
raw_messages: List[Dict[str, Any]],
@ -353,6 +426,9 @@ def process_raw_message_batch(
output_dir: str,
zerver_realmemoji: List[ZerverFieldsT],
total_reactions: List[ZerverFieldsT],
uploads_list: List[ZerverFieldsT],
zerver_attachment: List[ZerverFieldsT],
upload_id_to_upload_data_map: Dict[str, Dict[str, Any]],
) -> None:
def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
for user_id in mention_user_ids:
@ -390,6 +466,28 @@ def process_raw_message_batch(
rendered_content = None
has_attachment = False
has_image = False
has_link = raw_message["has_link"]
if "file" in raw_message:
has_attachment = True
has_link = True
attachment_content, has_image = process_message_attachment(
upload=raw_message["file"],
realm_id=realm_id,
message_id=message_id,
user_id=sender_user_id,
user_handler=user_handler,
uploads_list=uploads_list,
zerver_attachment=zerver_attachment,
upload_id_to_upload_data_map=upload_id_to_upload_data_map,
output_dir=output_dir,
)
content += attachment_content
topic_name = raw_message["topic_name"]
message = build_message(
@ -400,7 +498,9 @@ def process_raw_message_batch(
rendered_content=rendered_content,
topic_name=topic_name,
user_id=sender_user_id,
has_attachment=False,
has_image=has_image,
has_link=has_link,
has_attachment=has_attachment,
)
zerver_message.append(message)
build_reactions(
@ -445,6 +545,9 @@ def process_messages(
huddle_id_to_huddle_map: Dict[str, Dict[str, Any]],
zerver_realmemoji: List[ZerverFieldsT],
total_reactions: List[ZerverFieldsT],
uploads_list: List[ZerverFieldsT],
zerver_attachment: List[ZerverFieldsT],
upload_id_to_upload_data_map: Dict[str, Dict[str, Any]],
output_dir: str,
) -> None:
def list_reactions(reactions: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
@ -477,6 +580,7 @@ def process_messages(
content=content,
date_sent=int(message["ts"].timestamp()),
reactions=reactions,
has_link=True if message.get("urls") else False,
)
# Add recipient_id and topic to message_dict
@ -518,6 +622,10 @@ def process_messages(
mention_user_ids.add(user_id)
message_dict["mention_user_ids"] = mention_user_ids
# Add uploaded file (attachment) to message_dict
if message.get("file"):
message_dict["file"] = message["file"]
return message_dict
raw_messages: List[Dict[str, Any]] = []
@ -538,6 +646,9 @@ def process_messages(
output_dir=output_dir,
zerver_realmemoji=zerver_realmemoji,
total_reactions=total_reactions,
uploads_list=uploads_list,
zerver_attachment=zerver_attachment,
upload_id_to_upload_data_map=upload_id_to_upload_data_map,
)
chunk_size = 1000
@ -549,6 +660,20 @@ def process_messages(
)
def map_upload_id_to_upload_data(
    upload_data: Dict[str, List[Dict[str, Any]]],
) -> Dict[str, Dict[str, Any]]:
    """Index upload records by id and attach each upload's binary chunks.

    Args:
        upload_data: Dict with an ``"upload"`` list of upload records
            (each with an ``"_id"``) and a ``"chunk"`` list of chunk
            records (each with ``"files_id"`` and ``"data"``).

    Returns:
        A dict mapping upload id to a copy of its record with an extra
        ``"chunk"`` key: the list of that upload's chunk ``"data"``
        values, in the order they appear in ``upload_data["chunk"]``.
        Chunks whose ``"files_id"`` matches no upload record are logged
        and skipped.
    """
    upload_id_to_upload_data_map: Dict[str, Dict[str, Any]] = {
        upload["_id"]: {**upload, "chunk": []} for upload in upload_data["upload"]
    }

    for chunk in upload_data["chunk"]:
        upload_entry = upload_id_to_upload_data_map.get(chunk["files_id"])
        if upload_entry is None:
            # An orphan chunk (no metadata record) must not abort the
            # whole conversion with a KeyError; nothing can reference it.
            logging.info("Skipping chunk %s without metadata", chunk["files_id"])
            continue
        upload_entry["chunk"].append(chunk["data"])

    return upload_id_to_upload_data_map
def separate_channel_and_private_messages(
messages: List[Dict[str, Any]],
direct_id_to_direct_map: Dict[str, Dict[str, Any]],
@ -630,6 +755,7 @@ def rocketchat_data_to_dict(rocketchat_data_dir: str) -> Dict[str, Any]:
rocketchat_data["room"] = []
rocketchat_data["message"] = []
rocketchat_data["custom_emoji"] = {"emoji": [], "file": [], "chunk": []}
rocketchat_data["upload"] = {"upload": [], "file": [], "chunk": []}
# Get instance
with open(os.path.join(rocketchat_data_dir, "instances.bson"), "rb") as fcache:
@ -673,6 +799,21 @@ def rocketchat_data_to_dict(rocketchat_data_dir: str) -> Dict[str, Any]:
with open(os.path.join(rocketchat_data_dir, "custom_emoji.chunks.bson"), "rb") as fcache:
rocketchat_data["custom_emoji"]["chunk"] = bson.decode_all(fcache.read())
# Get uploads
with open(os.path.join(rocketchat_data_dir, "rocketchat_uploads.bson"), "rb") as fcache:
rocketchat_data["upload"]["upload"] = bson.decode_all(fcache.read())
if rocketchat_data["upload"]["upload"]:
with open(
os.path.join(rocketchat_data_dir, "rocketchat_uploads.files.bson"), "rb"
) as fcache:
rocketchat_data["upload"]["file"] = bson.decode_all(fcache.read())
with open(
os.path.join(rocketchat_data_dir, "rocketchat_uploads.chunks.bson"), "rb"
) as fcache:
rocketchat_data["upload"]["chunk"] = bson.decode_all(fcache.read())
return rocketchat_data
@ -807,6 +948,10 @@ def do_convert_data(rocketchat_data_dir: str, output_dir: str) -> None:
)
total_reactions: List[ZerverFieldsT] = []
uploads_list: List[ZerverFieldsT] = []
zerver_attachment: List[ZerverFieldsT] = []
upload_id_to_upload_data_map = map_upload_id_to_upload_data(rocketchat_data["upload"])
# Process channel messages
process_messages(
@ -827,6 +972,9 @@ def do_convert_data(rocketchat_data_dir: str, output_dir: str) -> None:
huddle_id_to_huddle_map=huddle_id_to_huddle_map,
zerver_realmemoji=zerver_realmemoji,
total_reactions=total_reactions,
uploads_list=uploads_list,
zerver_attachment=zerver_attachment,
upload_id_to_upload_data_map=upload_id_to_upload_data_map,
output_dir=output_dir,
)
# Process private messages
@ -848,6 +996,9 @@ def do_convert_data(rocketchat_data_dir: str, output_dir: str) -> None:
huddle_id_to_huddle_map=huddle_id_to_huddle_map,
zerver_realmemoji=zerver_realmemoji,
total_reactions=total_reactions,
uploads_list=uploads_list,
zerver_attachment=zerver_attachment,
upload_id_to_upload_data_map=upload_id_to_upload_data_map,
output_dir=output_dir,
)
realm["zerver_reaction"] = total_reactions
@ -857,12 +1008,11 @@ def do_convert_data(rocketchat_data_dir: str, output_dir: str) -> None:
create_converted_data_files(realm, output_dir, "/realm.json")
# TODO: Add support for importing avatars
create_converted_data_files([], output_dir, "/avatars/records.json")
# TODO: Add support for importing uploads
create_converted_data_files([], output_dir, "/uploads/records.json")
# TODO: Add support for importing attachments
attachment: Dict[str, List[Any]] = {"zerver_attachment": []}
# Import attachments
attachment: Dict[str, List[Any]] = {"zerver_attachment": zerver_attachment}
create_converted_data_files(attachment, output_dir, "/attachment.json")
create_converted_data_files(uploads_list, output_dir, "/uploads/records.json")
logging.info("Start making tarball")
subprocess.check_call(["tar", "-czf", output_dir + ".tar.gz", output_dir, "-P"])

Binary file not shown.

View File

@ -14,8 +14,10 @@ from zerver.data_import.rocketchat import (
convert_stream_subscription_data,
do_convert_data,
map_receiver_id_to_recipient_id,
map_upload_id_to_upload_data,
map_user_id_to_user,
map_username_to_user_id,
process_message_attachment,
process_users,
rocketchat_data_to_dict,
separate_channel_and_private_messages,
@ -32,7 +34,7 @@ class RocketChatImporter(ZulipTestCase):
def test_rocketchat_data_to_dict(self) -> None:
fixture_dir_name = self.fixture_file_name("", "rocketchat_fixtures")
rocketchat_data = rocketchat_data_to_dict(fixture_dir_name)
self.assert_length(rocketchat_data, 6)
self.assert_length(rocketchat_data, 7)
self.assert_length(rocketchat_data["user"], 6)
self.assertEqual(rocketchat_data["user"][2]["username"], "harry.potter")
@ -42,7 +44,7 @@ class RocketChatImporter(ZulipTestCase):
self.assertEqual(rocketchat_data["room"][0]["_id"], "GENERAL")
self.assertEqual(rocketchat_data["room"][0]["name"], "general")
self.assert_length(rocketchat_data["message"], 58)
self.assert_length(rocketchat_data["message"], 63)
self.assertEqual(rocketchat_data["message"][1]["msg"], "Hey everyone, how's it going??")
self.assertEqual(rocketchat_data["message"][1]["rid"], "GENERAL")
self.assertEqual(rocketchat_data["message"][1]["u"]["username"], "priyansh3133")
@ -50,6 +52,9 @@ class RocketChatImporter(ZulipTestCase):
self.assert_length(rocketchat_data["custom_emoji"]["emoji"], 3)
self.assertEqual(rocketchat_data["custom_emoji"]["emoji"][0]["name"], "tick")
self.assert_length(rocketchat_data["upload"]["upload"], 4)
self.assertEqual(rocketchat_data["upload"]["upload"][0]["name"], "harry-ron.jpg")
def test_map_user_id_to_user(self) -> None:
fixture_dir_name = self.fixture_file_name("", "rocketchat_fixtures")
rocketchat_data = rocketchat_data_to_dict(fixture_dir_name)
@ -613,9 +618,9 @@ class RocketChatImporter(ZulipTestCase):
private_messages=private_messages,
)
self.assert_length(rocketchat_data["message"], 58)
self.assert_length(channel_messages, 53)
self.assert_length(private_messages, 5)
self.assert_length(rocketchat_data["message"], 63)
self.assert_length(channel_messages, 57)
self.assert_length(private_messages, 6)
self.assertIn(rocketchat_data["message"][0], channel_messages)
self.assertIn(rocketchat_data["message"][1], channel_messages)
@ -661,8 +666,22 @@ class RocketChatImporter(ZulipTestCase):
)
# No new message added to channel or private messages
self.assert_length(channel_messages, 53)
self.assert_length(private_messages, 5)
self.assert_length(channel_messages, 57)
self.assert_length(private_messages, 6)
def test_map_upload_id_to_upload_data(self) -> None:
    """Each fixture upload gets a map entry carrying its metadata and chunks."""
    rocketchat_data = rocketchat_data_to_dict(
        self.fixture_file_name("", "rocketchat_fixtures")
    )
    uploads = rocketchat_data["upload"]["upload"]

    upload_id_to_upload_data_map = map_upload_id_to_upload_data(rocketchat_data["upload"])

    # One map entry per upload record in the fixtures.
    self.assert_length(uploads, 4)
    self.assert_length(upload_id_to_upload_data_map, 4)

    # The first upload keeps its name and has exactly one chunk attached.
    first_upload = uploads[0]
    mapped_upload = upload_id_to_upload_data_map[first_upload["_id"]]
    self.assertEqual(mapped_upload["name"], first_upload["name"])
    self.assert_length(mapped_upload["chunk"], 1)
def test_build_reactions(self) -> None:
fixture_dir_name = self.fixture_file_name("", "rocketchat_fixtures")
@ -734,6 +753,62 @@ class RocketChatImporter(ZulipTestCase):
self.assert_length(self.get_set(total_reactions, "id"), 7)
self.assert_length(self.get_set(total_reactions, "message"), 1)
def test_process_message_attachment(self) -> None:
    """Processing a message's file yields one attachment, one upload record and a file on disk."""
    rocketchat_data = rocketchat_data_to_dict(
        self.fixture_file_name("", "rocketchat_fixtures")
    )
    # NOTE(review): the label is "mattermost" although this is a Rocket.Chat
    # test; looks like a copy-paste, confirm before renaming.
    output_dir = self.make_import_output_dir("mattermost")

    realm_id = 3
    user_id_to_user_map = map_user_id_to_user(rocketchat_data["user"])
    user_handler = UserHandler()
    process_users(
        user_id_to_user_map=user_id_to_user_map,
        realm_id=realm_id,
        domain_name="zulip.com",
        user_handler=user_handler,
        user_id_mapper=IdMapper(),
    )

    zerver_attachments: List[ZerverFieldsT] = []
    uploads_list: List[ZerverFieldsT] = []
    upload_id_to_upload_data_map = map_upload_id_to_upload_data(rocketchat_data["upload"])

    # Message 55 in the fixtures carries a "file" entry.
    attached_file = rocketchat_data["message"][55]["file"]

    process_message_attachment(
        upload=attached_file,
        realm_id=realm_id,
        message_id=1,
        user_id=3,
        user_handler=user_handler,
        zerver_attachment=zerver_attachments,
        uploads_list=uploads_list,
        upload_id_to_upload_data_map=upload_id_to_upload_data_map,
        output_dir=output_dir,
    )

    # Exactly one attachment row, owned by the uploading user.
    self.assert_length(zerver_attachments, 1)
    attachment = zerver_attachments[0]
    self.assertEqual(attachment["file_name"], "harry-ron.jpg")
    self.assertEqual(attachment["owner"], 3)
    self.assertEqual(
        user_handler.get_user(attachment["owner"])["email"], "harrypotter@email.com"
    )
    # TODO: Assert this for False after fixing the file permissions in PMs
    self.assertTrue(attachment["is_realm_public"])

    # Exactly one upload record, carrying the uploader's email.
    self.assert_length(uploads_list, 1)
    self.assertEqual(uploads_list[0]["user_profile_email"], "harrypotter@email.com")

    # The file itself was written under <output_dir>/uploads/<path_id>.
    attachment_out_path = os.path.join(output_dir, "uploads", attachment["path_id"])
    self.assertTrue(os.path.exists(attachment_out_path))
    self.assertTrue(os.path.isfile(attachment_out_path))
def read_file(self, team_output_dir: str, output_file: str) -> Any:
full_path = os.path.join(team_output_dir, output_file)
with open(full_path, "rb") as f:
@ -760,6 +835,7 @@ class RocketChatImporter(ZulipTestCase):
self.assertEqual(os.path.exists(os.path.join(output_dir, "avatars")), True)
self.assertEqual(os.path.exists(os.path.join(output_dir, "emoji")), True)
self.assertEqual(os.path.exists(os.path.join(output_dir, "uploads")), True)
self.assertEqual(os.path.exists(os.path.join(output_dir, "attachment.json")), True)
realm = self.read_file(output_dir, "realm.json")
@ -867,23 +943,41 @@ class RocketChatImporter(ZulipTestCase):
for message in messages:
self.assertIsNotNone(message.rendered_content)
# After removing user_joined, added_user, discussion_created, etc.
# messages. (Total messages were 58.)
self.assert_length(messages, 31)
# messages. (Total messages were 63.)
self.assert_length(messages, 36)
stream_messages = messages.filter(recipient__type=Recipient.STREAM).order_by("date_sent")
stream_recipients = stream_messages.values_list("recipient", flat=True)
self.assert_length(stream_messages, 26)
self.assert_length(stream_messages, 30)
self.assert_length(set(stream_recipients), 5)
self.assertEqual(stream_messages[0].sender.email, "priyansh3133@email.com")
self.assertEqual(stream_messages[0].content, "Hey everyone, how's it going??")
self.assertEqual(stream_messages[23].sender.email, "harrypotter@email.com")
self.assertRegex(
stream_messages[23].content,
"Just a random pic!\n\n\\[harry-ron.jpg\\]\\(.*\\)",
)
self.assertTrue(stream_messages[23].has_attachment)
self.assertTrue(stream_messages[23].has_image)
self.assertTrue(stream_messages[23].has_link)
huddle_messages = messages.filter(recipient__type=Recipient.HUDDLE).order_by("date_sent")
huddle_recipients = huddle_messages.values_list("recipient", flat=True)
self.assert_length(huddle_messages, 2)
self.assert_length(huddle_messages, 3)
self.assert_length(set(huddle_recipients), 1)
self.assertEqual(huddle_messages[0].sender.email, "hermionegranger@email.com")
self.assertEqual(huddle_messages[0].content, "Hey people!")
self.assertEqual(huddle_messages[2].sender.email, "harrypotter@email.com")
self.assertRegex(
huddle_messages[2].content,
"This year's curriculum is out.\n\n\\[Hogwarts Curriculum.pdf\\]\\(.*\\)",
)
self.assertTrue(huddle_messages[2].has_attachment)
self.assertFalse(huddle_messages[2].has_image)
self.assertTrue(huddle_messages[2].has_link)
personal_messages = messages.filter(recipient__type=Recipient.PERSONAL).order_by(
"date_sent"
)