""" file: telegram-chat-parser.py author: Artur Rodrigues Rocha Neto email: artur.rodrigues26@gmail.com github: https://github.com/keizerzilla created: 23/12/2020 description: Script to parse a Telegram chat history JSON file into a tabular format (CSV). requirements: Python 3.x """ import re import sys import csv import json from datetime import datetime columns = [ "msg_id", "sender", "sender_id", "reply_to_msg_id", "date", "msg_type", "msg_text", "msg_content", "is_edited", "edit_date", "has_mention", "has_email", "has_phone", "has_hashtag", "is_forwarded", "is_bot_command", ] file_types = [ "animation", "video_file", "video_message", "voice_message", "audio_file", ] mention_types = [ "mention", "mention_name", ] null_name_counter = 0 def parse_telegram_to_csv(jdata): if jdata.get("name") is None: global null_name_counter null_name_counter += 1 chat_name = f"UnnamedChat-{null_name_counter}" else: chat_name = re.sub(r'[\W_]+', u'', jdata.get("name"), flags=re.UNICODE) output_filepath = f"{chat_name}.csv" with open(output_filepath, "w", encoding="utf-8") as output_file: writer = csv.DictWriter(output_file, columns, dialect="unix", quoting=csv.QUOTE_NONNUMERIC) writer.writeheader() for message in jdata["messages"]: if message["type"] != "message": continue msg_id = message["id"] sender = message["from"] sender_id = message["from_id"] reply_to_msg_id = message["reply_to_message_id"] if "reply_to_message_id" in message else -1 date = message["date"].replace("T", " ") dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S") msg_content = message["text"] msg_text = message["text"] msg_type = "text" if "media_type" in message: msg_type = message["media_type"] if message["media_type"] == "sticker": if "sticker_emoji" in message: msg_content = message["file"] else: msg_content = "?" elif message["media_type"] in file_types: msg_content = message["file"] elif "file" in message: msg_type = "file" msg_content = message["file"] if "photo" in message: msg_type = "photo" msg_content = message["photo"] elif "poll" in message: msg_type = "poll" msg_content = str(message["poll"]["total_voters"]) elif "location_information" in message: msg_type = "location" loc = message["location_information"] msg_content = str(loc["latitude"]) + "," + str(loc["longitude"]) has_mention = 0 has_email = 0 has_phone = 0 has_hashtag = 0 is_bot_command = 0 if type(msg_content) == list: txt_content = "" for part in msg_content: if type(part) == str: txt_content += part elif type(part) == dict: if part["type"] == "link": msg_type = "link" elif part["type"] in mention_types: has_mention = 1 elif part["type"] == "email": has_email = 1 elif part["type"] == "phone": has_phone = 1 elif part["type"] == "hashtag": has_hashtag = 1 elif part["type"] == "bot_command": is_bot_command = 1 txt_content += part["text"] msg_content = txt_content msg_text = txt_content msg_content = msg_content.replace("\n", " ") is_edited = 0 edit_date = 0 if "edited" in message: edit_date = message["edited"].replace("T", " ") dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S") is_edited = 1 is_forwarded = 0 if "is_forwarded" in message: is_forwarded = 1 row = { "msg_id" : msg_id, "sender" : sender, "sender_id" : sender_id, "reply_to_msg_id": reply_to_msg_id, "date" : date, "msg_type" : msg_type, "msg_text" : msg_text, "msg_content" : msg_content, "is_edited" : is_edited, "edit_date" : edit_date, "has_mention" : has_mention, "has_email" : has_email, "has_phone" : has_phone, "has_hashtag" : has_hashtag, "is_forwarded": is_forwarded, "is_bot_command" : is_bot_command, } writer.writerow(row) print(chat_name, "OK!") if __name__ == "__main__": if len(sys.argv) != 2: print("ERROR: incorrect number of arguments!") print("How to use it:") print(" python3 telegram-chat-parser.py ") print("Example:") print(" python3 telegram-chat-parser.py movies_group.json") sys.exit() backup_filepath = sys.argv[1] with open(backup_filepath, "r", encoding="utf-8") as input_file: contents = input_file.read() jdata = json.loads(contents) if "chats" not in jdata: parse_telegram_to_csv(jdata) else: for chat in jdata["chats"]["list"]: parse_telegram_to_csv(chat)