Browse Source

added data parser

arnaud 2 years ago
parent
commit
0a964a46ff
1 changed files with 194 additions and 0 deletions
  1. 194 0
      telegram-chat-parser.py

+ 194 - 0
telegram-chat-parser.py

@@ -0,0 +1,194 @@
+"""
+file:         telegram-chat-parser.py
+author:       Artur Rodrigues Rocha Neto
+email:        artur.rodrigues26@gmail.com
+github:       https://github.com/keizerzilla
+created:      23/12/2020
+description:  Script to parse a Telegram chat history JSON file into a tabular format (CSV).
+requirements: Python 3.x
+"""
+
+import re
+import sys
+import csv
+import json
+from datetime import datetime
+
+
+columns = [
+           "msg_id",
+           "sender",
+           "sender_id",
+           "reply_to_msg_id",
+           "date",
+           "msg_type",
+           "msg_text",
+           "msg_content",
+           "is_edited",
+           "edit_date",
+           "has_mention",
+           "has_email",
+           "has_phone",
+           "has_hashtag",
+           "is_forwarded",
+           "is_bot_command",
+          ]
+
+file_types = [
+              "animation",
+              "video_file",
+              "video_message",
+              "voice_message",
+              "audio_file",
+             ]
+
+mention_types = [
+                 "mention",
+                 "mention_name",
+                ]
+
+null_name_counter = 0
+
+def parse_telegram_to_csv(jdata):
+    
+    if jdata.get("name") is None:
+        global null_name_counter 
+        null_name_counter += 1
+        chat_name = f"UnnamedChat-{null_name_counter}"
+    else:
+        chat_name = re.sub(r'[\W_]+', u'', jdata.get("name"), flags=re.UNICODE)
+    output_filepath = f"{chat_name}.csv"
+
+    
+    with open(output_filepath, "w", encoding="utf-8") as output_file:
+        writer = csv.DictWriter(output_file, columns, dialect="unix", quoting=csv.QUOTE_NONNUMERIC)
+        writer.writeheader()
+        
+        for message in jdata["messages"]:
+            if message["type"] != "message":
+                continue
+            
+            msg_id = message["id"]
+            sender = message["from"]
+            sender_id = message["from_id"]
+            reply_to_msg_id = message["reply_to_message_id"] if "reply_to_message_id" in message else -1
+            date = message["date"].replace("T", " ")
+            dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
+            
+            msg_content = message["text"]
+            msg_text = message["text"]
+            msg_type = "text"
+            
+            if "media_type" in message:
+                msg_type = message["media_type"]
+                if message["media_type"] == "sticker":
+                    if "sticker_emoji" in message:
+                        msg_content = message["file"]
+                    else:
+                        msg_content = "?"
+                elif message["media_type"] in file_types:
+                    msg_content = message["file"]
+            elif "file" in message:
+                msg_type = "file"
+                msg_content = message["file"]
+            
+            if "photo" in message:
+                msg_type = "photo"
+                msg_content = message["photo"]
+            elif "poll" in message:
+                msg_type = "poll"
+                msg_content = str(message["poll"]["total_voters"])
+            elif "location_information" in message:
+                msg_type = "location"
+                loc = message["location_information"]
+                msg_content = str(loc["latitude"]) + "," + str(loc["longitude"])
+            
+            has_mention = 0
+            has_email = 0
+            has_phone = 0
+            has_hashtag = 0
+            is_bot_command = 0
+
+            
+            if type(msg_content) == list:
+                txt_content = ""
+                for part in msg_content:
+                    if type(part) == str:
+                        txt_content += part
+                    elif type(part) == dict:
+                        if part["type"] == "link":
+                            msg_type = "link"
+                        elif part["type"] in mention_types:
+                            has_mention = 1
+                        elif part["type"] == "email":
+                            has_email = 1
+                        elif part["type"] == "phone":
+                            has_phone = 1
+                        elif part["type"] == "hashtag":
+                            has_hashtag = 1
+                        elif part["type"] == "bot_command":
+                            is_bot_command = 1
+                        
+                        txt_content += part["text"]
+                msg_content = txt_content
+                msg_text = txt_content
+            
+            msg_content = msg_content.replace("\n", " ")
+            
+            is_edited = 0
+            edit_date = 0
+
+            if "edited" in message:
+                edit_date = message["edited"].replace("T", " ")
+                dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
+                is_edited = 1
+
+            is_forwarded = 0
+
+            if "is_forwarded" in message:
+                is_forwarded = 1
+
+            row = {
+                "msg_id"  :         msg_id,
+                "sender"  :         sender,
+                "sender_id"  :      sender_id,
+                "reply_to_msg_id":  reply_to_msg_id,
+                "date"  :           date,
+                "msg_type"  :       msg_type,
+                "msg_text"  :       msg_text,
+                "msg_content"  :    msg_content,
+                "is_edited"  :      is_edited,
+                "edit_date"  :      edit_date,
+                "has_mention"  :    has_mention,
+                "has_email"  :      has_email,
+                "has_phone"  :      has_phone,
+                "has_hashtag"  :    has_hashtag,
+                "is_forwarded":     is_forwarded,
+                "is_bot_command"  : is_bot_command,
+            }
+            writer.writerow(row)
+
+    print(chat_name, "OK!")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("ERROR: incorrect number of arguments!")
+        print("How to use it:")
+        print("    python3 telegram-chat-parser.py <chat_history_json>")
+        print("Example:")
+        print("    python3 telegram-chat-parser.py movies_group.json")
+        sys.exit()
+
+    backup_filepath = sys.argv[1]
+    
+    with open(backup_filepath, "r", encoding="utf-8") as input_file:
+        contents = input_file.read()
+        jdata = json.loads(contents)
+        
+        if "chats" not in jdata:
+            parse_telegram_to_csv(jdata)
+        else:
+            for chat in jdata["chats"]["list"]:
+                parse_telegram_to_csv(chat)
+