"""
file: telegram-chat-parser.py
author: Artur Rodrigues Rocha Neto
email: artur.rodrigues26@gmail.com
github: https://github.com/keizerzilla
created: 23/12/2020
description: Script to parse a Telegram chat history JSON file into a tabular format (CSV).
requirements: Python 3.6+ (the script uses f-strings)
"""
- import re
- import sys
- import csv
- import json
- from datetime import datetime
# Column order of the generated CSV; passed to csv.DictWriter as field names.
columns = [
    "msg_id",
    "sender",
    "sender_id",
    "reply_to_msg_id",
    "date",
    "msg_type",
    "msg_text",
    "msg_content",
    "is_edited",
    "edit_date",
    "has_mention",
    "has_email",
    "has_phone",
    "has_hashtag",
    "is_forwarded",
    "is_bot_command",
]

# "media_type" values whose content is taken from the message's "file" key.
file_types = [
    "animation",
    "video_file",
    "video_message",
    "voice_message",
    "audio_file",
]

# Text-entity types that set the "has_mention" flag.
mention_types = [
    "mention",
    "mention_name",
]

# How many chats needed a generated name so far; keeps fallback names unique.
null_name_counter = 0

# Timestamp layout expected once the ISO "T" separator is replaced by a space.
_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"


def _chat_file_stem(jdata):
    """Return the output file name (without extension) for a chat.

    The chat's "name" is stripped of every non-word character and underscore.
    When the name is missing, or nothing is left after stripping, a unique
    "UnnamedChat-<n>" name is generated from the module-level counter.
    """
    global null_name_counter

    name = jdata.get("name")
    stem = "" if name is None else re.sub(r"[\W_]+", "", name)
    if not stem:
        # An empty stem would otherwise create a file called just ".csv".
        null_name_counter += 1
        stem = f"UnnamedChat-{null_name_counter}"
    return stem


def _normalize_timestamp(raw):
    """Replace the "T" separator with a space and validate the result.

    Raises ValueError when the value does not match "%Y-%m-%d %H:%M:%S".
    """
    stamp = raw.replace("T", " ")
    datetime.strptime(stamp, _DATE_FORMAT)  # validation only; result unused
    return stamp


def _flatten_text(text):
    """Return (plain_text, entity_types) for a message's "text" value.

    A list value is a sequence of plain strings and entity dicts; the strings
    and each dict's "text" are concatenated, and each dict's "type" is
    collected. Any non-list value is returned unchanged with no entity types.
    """
    if not isinstance(text, list):
        return text, set()

    pieces = []
    entity_types = set()
    for part in text:
        if isinstance(part, str):
            pieces.append(part)
        elif isinstance(part, dict):
            entity_types.add(part["type"])
            pieces.append(part["text"])
    return "".join(pieces), entity_types


def _media_payload(message):
    """Return (msg_type, payload) describing the message's attachment.

    payload is None when the message text itself is the content.
    """
    msg_type = "text"
    payload = None

    if "media_type" in message:
        msg_type = message["media_type"]
        if msg_type == "sticker":
            # NOTE(review): when the emoji is known the file path is stored,
            # not the emoji itself. Kept as in the original; confirm intent.
            payload = message["file"] if "sticker_emoji" in message else "?"
        elif msg_type in file_types:
            payload = message["file"]
    elif "file" in message:
        msg_type = "file"
        payload = message["file"]

    # These keys take precedence over whatever was decided above.
    if "photo" in message:
        msg_type = "photo"
        payload = message["photo"]
    elif "poll" in message:
        msg_type = "poll"
        payload = str(message["poll"]["total_voters"])
    elif "location_information" in message:
        msg_type = "location"
        loc = message["location_information"]
        payload = str(loc["latitude"]) + "," + str(loc["longitude"])

    return msg_type, payload


def _message_to_row(message):
    """Build the CSV row dict (keyed by `columns`) for one message."""
    date = _normalize_timestamp(message["date"])

    # Always flatten the text so that msg_text is a plain string and the
    # entity flags are set even when the message also carries an attachment.
    msg_text, entity_types = _flatten_text(message["text"])
    msg_type, payload = _media_payload(message)

    if payload is None:
        msg_content = msg_text
        # A link entity only reclassifies messages whose content is the text.
        if "link" in entity_types:
            msg_type = "link"
    else:
        msg_content = payload
    # Keep msg_content on a single line; msg_text keeps its line breaks.
    msg_content = msg_content.replace("\n", " ")

    is_edited = 0
    edit_date = 0
    if "edited" in message:
        # Validate the edit timestamp itself (the original re-checked `date`).
        edit_date = _normalize_timestamp(message["edited"])
        is_edited = 1

    # "is_forwarded" is the key the original code looked for. "forwarded_from"
    # is presumably the key Telegram exports actually use - TODO confirm.
    is_forwarded = int("forwarded_from" in message or "is_forwarded" in message)

    return {
        "msg_id": message["id"],
        "sender": message["from"],
        "sender_id": message["from_id"],
        "reply_to_msg_id": message.get("reply_to_message_id", -1),
        "date": date,
        "msg_type": msg_type,
        "msg_text": msg_text,
        "msg_content": msg_content,
        "is_edited": is_edited,
        "edit_date": edit_date,
        "has_mention": int(any(kind in entity_types for kind in mention_types)),
        "has_email": int("email" in entity_types),
        "has_phone": int("phone" in entity_types),
        "has_hashtag": int("hashtag" in entity_types),
        "is_forwarded": is_forwarded,
        "is_bot_command": int("bot_command" in entity_types),
    }


def parse_telegram_to_csv(jdata):
    """Write one chat's messages to "<chat name>.csv" in the working directory.

    Args:
        jdata: dict for a single chat. Its optional "name" is used for the
            file name and its "messages" list supplies the rows.

    Only entries whose "type" is "message" are written, one row per message
    with the fields listed in `columns`. Prints "<chat name> OK!" when done.

    Raises:
        KeyError: if a message lacks a required key (e.g. "id", "from",
            "from_id", "date", "text").
        ValueError: if "date" or "edited" is not in the expected format.
    """
    chat_name = _chat_file_stem(jdata)
    output_filepath = f"{chat_name}.csv"

    # newline="" lets the csv module control line endings, so the "unix"
    # dialect's "\n" terminator is not translated on Windows.
    with open(output_filepath, "w", encoding="utf-8", newline="") as output_file:
        writer = csv.DictWriter(
            output_file,
            columns,
            dialect="unix",
            quoting=csv.QUOTE_NONNUMERIC,
        )
        writer.writeheader()

        for message in jdata["messages"]:
            if message["type"] != "message":
                continue
            writer.writerow(_message_to_row(message))

    print(chat_name, "OK!")
def main(argv=None):
    """Convert the chat history JSON file named on the command line to CSV.

    Args:
        argv: argument vector including the program name; defaults to
            sys.argv. Exactly one argument (the JSON file path) is expected.

    The file is either a single-chat export, which is parsed directly, or a
    full export holding a "chats" object whose "list" entries are parsed one
    by one. On a wrong argument count, usage help is printed to stderr and
    the process exits with status 1.
    """
    argv = sys.argv if argv is None else argv

    if len(argv) != 2:
        print("ERROR: incorrect number of arguments!", file=sys.stderr)
        print("How to use it:", file=sys.stderr)
        print(" python3 telegram-chat-parser.py <chat_history_json>", file=sys.stderr)
        print("Example:", file=sys.stderr)
        print(" python3 telegram-chat-parser.py movies_group.json", file=sys.stderr)
        # A bare sys.exit() would report success (status 0) to the shell.
        sys.exit(1)

    backup_filepath = argv[1]

    with open(backup_filepath, "r", encoding="utf-8") as input_file:
        jdata = json.load(input_file)

    if "chats" not in jdata:
        parse_telegram_to_csv(jdata)
    else:
        for chat in jdata["chats"]["list"]:
            parse_telegram_to_csv(chat)


if __name__ == "__main__":
    main()
|