telegram-chat-parser.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. """
  2. file: telegram-chat-parser.py
  3. author: Artur Rodrigues Rocha Neto
  4. email: artur.rodrigues26@gmail.com
  5. github: https://github.com/keizerzilla
  6. created: 23/12/2020
  7. description: Script to parse a Telegram chat history JSON file into a tabular format (CSV).
  8. requirements: Python 3.x
  9. """
  10. import re
  11. import sys
  12. import csv
  13. import json
  14. from datetime import datetime
  15. columns = [
  16. "msg_id",
  17. "sender",
  18. "sender_id",
  19. "reply_to_msg_id",
  20. "date",
  21. "msg_type",
  22. "msg_text",
  23. "msg_content",
  24. "is_edited",
  25. "edit_date",
  26. "has_mention",
  27. "has_email",
  28. "has_phone",
  29. "has_hashtag",
  30. "is_forwarded",
  31. "is_bot_command",
  32. ]
  33. file_types = [
  34. "animation",
  35. "video_file",
  36. "video_message",
  37. "voice_message",
  38. "audio_file",
  39. ]
  40. mention_types = [
  41. "mention",
  42. "mention_name",
  43. ]
  44. null_name_counter = 0
  45. def parse_telegram_to_csv(jdata):
  46. if jdata.get("name") is None:
  47. global null_name_counter
  48. null_name_counter += 1
  49. chat_name = f"UnnamedChat-{null_name_counter}"
  50. else:
  51. chat_name = re.sub(r'[\W_]+', u'', jdata.get("name"), flags=re.UNICODE)
  52. output_filepath = f"{chat_name}.csv"
  53. with open(output_filepath, "w", encoding="utf-8") as output_file:
  54. writer = csv.DictWriter(output_file, columns, dialect="unix", quoting=csv.QUOTE_NONNUMERIC)
  55. writer.writeheader()
  56. for message in jdata["messages"]:
  57. if message["type"] != "message":
  58. continue
  59. msg_id = message["id"]
  60. sender = message["from"]
  61. sender_id = message["from_id"]
  62. reply_to_msg_id = message["reply_to_message_id"] if "reply_to_message_id" in message else -1
  63. date = message["date"].replace("T", " ")
  64. dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
  65. msg_content = message["text"]
  66. msg_text = message["text"]
  67. msg_type = "text"
  68. if "media_type" in message:
  69. msg_type = message["media_type"]
  70. if message["media_type"] == "sticker":
  71. if "sticker_emoji" in message:
  72. msg_content = message["file"]
  73. else:
  74. msg_content = "?"
  75. elif message["media_type"] in file_types:
  76. msg_content = message["file"]
  77. elif "file" in message:
  78. msg_type = "file"
  79. msg_content = message["file"]
  80. if "photo" in message:
  81. msg_type = "photo"
  82. msg_content = message["photo"]
  83. elif "poll" in message:
  84. msg_type = "poll"
  85. msg_content = str(message["poll"]["total_voters"])
  86. elif "location_information" in message:
  87. msg_type = "location"
  88. loc = message["location_information"]
  89. msg_content = str(loc["latitude"]) + "," + str(loc["longitude"])
  90. has_mention = 0
  91. has_email = 0
  92. has_phone = 0
  93. has_hashtag = 0
  94. is_bot_command = 0
  95. if type(msg_content) == list:
  96. txt_content = ""
  97. for part in msg_content:
  98. if type(part) == str:
  99. txt_content += part
  100. elif type(part) == dict:
  101. if part["type"] == "link":
  102. msg_type = "link"
  103. elif part["type"] in mention_types:
  104. has_mention = 1
  105. elif part["type"] == "email":
  106. has_email = 1
  107. elif part["type"] == "phone":
  108. has_phone = 1
  109. elif part["type"] == "hashtag":
  110. has_hashtag = 1
  111. elif part["type"] == "bot_command":
  112. is_bot_command = 1
  113. txt_content += part["text"]
  114. msg_content = txt_content
  115. msg_text = txt_content
  116. msg_content = msg_content.replace("\n", " ")
  117. is_edited = 0
  118. edit_date = 0
  119. if "edited" in message:
  120. edit_date = message["edited"].replace("T", " ")
  121. dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
  122. is_edited = 1
  123. is_forwarded = 0
  124. if "is_forwarded" in message:
  125. is_forwarded = 1
  126. row = {
  127. "msg_id" : msg_id,
  128. "sender" : sender,
  129. "sender_id" : sender_id,
  130. "reply_to_msg_id": reply_to_msg_id,
  131. "date" : date,
  132. "msg_type" : msg_type,
  133. "msg_text" : msg_text,
  134. "msg_content" : msg_content,
  135. "is_edited" : is_edited,
  136. "edit_date" : edit_date,
  137. "has_mention" : has_mention,
  138. "has_email" : has_email,
  139. "has_phone" : has_phone,
  140. "has_hashtag" : has_hashtag,
  141. "is_forwarded": is_forwarded,
  142. "is_bot_command" : is_bot_command,
  143. }
  144. writer.writerow(row)
  145. print(chat_name, "OK!")
  146. if __name__ == "__main__":
  147. if len(sys.argv) != 2:
  148. print("ERROR: incorrect number of arguments!")
  149. print("How to use it:")
  150. print(" python3 telegram-chat-parser.py <chat_history_json>")
  151. print("Example:")
  152. print(" python3 telegram-chat-parser.py movies_group.json")
  153. sys.exit()
  154. backup_filepath = sys.argv[1]
  155. with open(backup_filepath, "r", encoding="utf-8") as input_file:
  156. contents = input_file.read()
  157. jdata = json.loads(contents)
  158. if "chats" not in jdata:
  159. parse_telegram_to_csv(jdata)
  160. else:
  161. for chat in jdata["chats"]["list"]:
  162. parse_telegram_to_csv(chat)