2 years ago · 0a964a46ff
--- a/telegram-chat-parser.py
+++ b/telegram-chat-parser.py
@@ -0,0 +1,194 @@
 
				+"""

			
 
				+file:         telegram-chat-parser.py

			
 
				+author:       Artur Rodrigues Rocha Neto

			
 
				+email:        artur.rodrigues26@gmail.com

			
 
				+github:       https://github.com/keizerzilla

			
 
				+created:      23/12/2020

			
 
				+description:  Script to parse a Telegram chat history JSON file into a tabular format (CSV).

			
 
				+requirements: Python 3.x

			
 
				+"""

			
 
				+

			
 
				+import re

			
 
				+import sys

			
 
				+import csv

			
 
				+import json

			
 
				+from datetime import datetime

			
 
				+

			
 
				+

			
 
				+columns = [

			
 
				+           "msg_id",

			
 
				+           "sender",

			
 
				+           "sender_id",

			
 
				+           "reply_to_msg_id",

			
 
				+           "date",

			
 
				+           "msg_type",

			
 
				+           "msg_text",

			
 
				+           "msg_content",

			
 
				+           "is_edited",

			
 
				+           "edit_date",

			
 
				+           "has_mention",

			
 
				+           "has_email",

			
 
				+           "has_phone",

			
 
				+           "has_hashtag",

			
 
				+           "is_forwarded",

			
 
				+           "is_bot_command",

			
 
				+          ]

			
 
				+

			
 
				+file_types = [

			
 
				+              "animation",

			
 
				+              "video_file",

			
 
				+              "video_message",

			
 
				+              "voice_message",

			
 
				+              "audio_file",

			
 
				+             ]

			
 
				+

			
 
				+mention_types = [

			
 
				+                 "mention",

			
 
				+                 "mention_name",

			
 
				+                ]

			
 
				+

			
 
				+null_name_counter = 0

			
 
				+

			
 
				+def parse_telegram_to_csv(jdata):

			
 
				+    

			
 
				+    if jdata.get("name") is None:

			
 
				+        global null_name_counter 

			
 
				+        null_name_counter += 1

			
 
				+        chat_name = f"UnnamedChat-{null_name_counter}"

			
 
				+    else:

			
 
				+        chat_name = re.sub(r'[\W_]+', u'', jdata.get("name"), flags=re.UNICODE)

			
 
				+    output_filepath = f"{chat_name}.csv"

			
 
				+

			
 
				+    

			
 
				+    with open(output_filepath, "w", encoding="utf-8") as output_file:

			
 
				+        writer = csv.DictWriter(output_file, columns, dialect="unix", quoting=csv.QUOTE_NONNUMERIC)

			
 
				+        writer.writeheader()

			
 
				+        

			
 
				+        for message in jdata["messages"]:

			
 
				+            if message["type"] != "message":

			
 
				+                continue

			
 
				+            

			
 
				+            msg_id = message["id"]

			
 
				+            sender = message["from"]

			
 
				+            sender_id = message["from_id"]

			
 
				+            reply_to_msg_id = message["reply_to_message_id"] if "reply_to_message_id" in message else -1

			
 
				+            date = message["date"].replace("T", " ")

			
 
				+            dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")

			
 
				+            

			
 
				+            msg_content = message["text"]

			
 
				+            msg_text = message["text"]

			
 
				+            msg_type = "text"

			
 
				+            

			
 
				+            if "media_type" in message:

			
 
				+                msg_type = message["media_type"]

			
 
				+                if message["media_type"] == "sticker":

			
 
				+                    if "sticker_emoji" in message:

			
 
				+                        msg_content = message["file"]

			
 
				+                    else:

			
 
				+                        msg_content = "?"

			
 
				+                elif message["media_type"] in file_types:

			
 
				+                    msg_content = message["file"]

			
 
				+            elif "file" in message:

			
 
				+                msg_type = "file"

			
 
				+                msg_content = message["file"]

			
 
				+            

			
 
				+            if "photo" in message:

			
 
				+                msg_type = "photo"

			
 
				+                msg_content = message["photo"]

			
 
				+            elif "poll" in message:

			
 
				+                msg_type = "poll"

			
 
				+                msg_content = str(message["poll"]["total_voters"])

			
 
				+            elif "location_information" in message:

			
 
				+                msg_type = "location"

			
 
				+                loc = message["location_information"]

			
 
				+                msg_content = str(loc["latitude"]) + "," + str(loc["longitude"])

			
 
				+            

			
 
				+            has_mention = 0

			
 
				+            has_email = 0

			
 
				+            has_phone = 0

			
 
				+            has_hashtag = 0

			
 
				+            is_bot_command = 0

			
 
				+

			
 
				+            

			
 
				+            if type(msg_content) == list:

			
 
				+                txt_content = ""

			
 
				+                for part in msg_content:

			
 
				+                    if type(part) == str:

			
 
				+                        txt_content += part

			
 
				+                    elif type(part) == dict:

			
 
				+                        if part["type"] == "link":

			
 
				+                            msg_type = "link"

			
 
				+                        elif part["type"] in mention_types:

			
 
				+                            has_mention = 1

			
 
				+                        elif part["type"] == "email":

			
 
				+                            has_email = 1

			
 
				+                        elif part["type"] == "phone":

			
 
				+                            has_phone = 1

			
 
				+                        elif part["type"] == "hashtag":

			
 
				+                            has_hashtag = 1

			
 
				+                        elif part["type"] == "bot_command":

			
 
				+                            is_bot_command = 1

			
 
				+                        

			
 
				+                        txt_content += part["text"]

			
 
				+                msg_content = txt_content

			
 
				+                msg_text = txt_content

			
 
				+            

			
 
				+            msg_content = msg_content.replace("\n", " ")

			
 
				+            

			
 
				+            is_edited = 0

			
 
				+            edit_date = 0

			
 
				+

			
 
				+            if "edited" in message:

			
 
				+                edit_date = message["edited"].replace("T", " ")

			
 
				+                dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")

			
 
				+                is_edited = 1

			
 
				+

			
 
				+            is_forwarded = 0

			
 
				+

			
 
				+            if "is_forwarded" in message:

			
 
				+                is_forwarded = 1

			
 
				+

			
 
				+            row = {

			
 
				+                "msg_id"  :         msg_id,

			
 
				+                "sender"  :         sender,

			
 
				+                "sender_id"  :      sender_id,

			
 
				+                "reply_to_msg_id":  reply_to_msg_id,

			
 
				+                "date"  :           date,

			
 
				+                "msg_type"  :       msg_type,

			
 
				+                "msg_text"  :       msg_text,

			
 
				+                "msg_content"  :    msg_content,

			
 
				+                "is_edited"  :      is_edited,

			
 
				+                "edit_date"  :      edit_date,

			
 
				+                "has_mention"  :    has_mention,

			
 
				+                "has_email"  :      has_email,

			
 
				+                "has_phone"  :      has_phone,

			
 
				+                "has_hashtag"  :    has_hashtag,

			
 
				+                "is_forwarded":     is_forwarded,

			
 
				+                "is_bot_command"  : is_bot_command,

			
 
				+            }

			
 
				+            writer.writerow(row)

			
 
				+

			
 
				+    print(chat_name, "OK!")

			
 
				+

			
 
				+

			
 
				+if __name__ == "__main__":

			
 
				+    if len(sys.argv) != 2:

			
 
				+        print("ERROR: incorrect number of arguments!")

			
 
				+        print("How to use it:")

			
 
				+        print("    python3 telegram-chat-parser.py <chat_history_json>")

			
 
				+        print("Example:")

			
 
				+        print("    python3 telegram-chat-parser.py movies_group.json")

			
 
				+        sys.exit()

			
 
				+

			
 
				+    backup_filepath = sys.argv[1]

			
 
				+    

			
 
				+    with open(backup_filepath, "r", encoding="utf-8") as input_file:

			
 
				+        contents = input_file.read()

			
 
				+        jdata = json.loads(contents)

			
 
				+        

			
 
				+        if "chats" not in jdata:

			
 
				+            parse_telegram_to_csv(jdata)

			
 
				+        else:

			
 
				+            for chat in jdata["chats"]["list"]:

			
 
				+                parse_telegram_to_csv(chat)

			
 
				+