In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

In [None]:
# Access paths
DATA_PATH = './data'
btz_path = os.path.join(DATA_PATH, 'bretzel.csv')

# The CSV file is produced by the parser, which must be run beforehand.
# Every column is read as text and the `date` column becomes the index.
my_data = pd.read_csv(btz_path, index_col='date', dtype=str)
# Missing cells become empty strings. `fillna` is used rather than
# `replace([np.NaN], '')` because the `np.NaN` alias was removed in NumPy 2.0.
my_data = my_data.fillna('')

In [None]:
# Display name -> identifier matched against the `sender_id` column by the
# plotting helpers. Insertion order is the order in which curves are drawn.
btz_core_id = dict([
    ('Nathan', 'user502298447'),
    ('Louison', 'user435924815'),
    ('Clément', 'user451690413'),
    ('Théo', 'user490556008'),
    ('Léopold', 'user483543500'),
    ('Arnaud', 'user469821944'),
])

In [None]:
def words_frequency(data):
    """Count how often each word occurs in the `msg_text` column.

    The text is lower-cased, the characters ``, . ( ) { } [ ] -`` are deleted,
    apostrophes are turned into spaces (so that "l'ami" counts as "l" and
    "ami"), and the result is split on whitespace.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a `msg_text` column. Missing values are treated as empty
        messages instead of being counted as the word "nan"/"none".

    Returns
    -------
    pandas.Series
        Number of occurrences indexed by word, most frequent first.
    """
    # One literal translation pass: map "'" to a space and drop punctuation.
    # This avoids `str.replace`, whose `regex` default differs between pandas
    # versions while several of these characters are regex metacharacters.
    table = str.maketrans("'", ' ', ',.(){}[]-')
    words = (
        data['msg_text']
        .fillna('')
        .astype(str)
        .str.translate(table)
        .str.lower()
        .str.split()
        # One row per word; avoids the dense (messages x longest message)
        # frame that `split(expand=True).stack()` would allocate.
        .explode()
        # Drop the column name so the result's index stays unnamed.
        .rename(None)
    )
    # Messages without any word explode to NaN, which value_counts ignores.
    return words.value_counts()

In [None]:
def plot_message_count(data):
    """Plot the cumulative number of messages over time for each core member.

    One curve is drawn per entry of `btz_core_id`, using the rows of `data`
    whose `sender_id` equals that member's identifier.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a `sender_id` column and an index that
        `pd.to_datetime(..., yearfirst=True)` can parse.
    """
    for name, sender_id in btz_core_id.items():
        is_sender = (data['sender_id'] == sender_id).to_numpy()
        # Sort the timestamps: a running count is only meaningful against
        # increasing dates, and the input is not guaranteed to be chronological.
        timestamps = pd.to_datetime(data.index[is_sender], yearfirst=True).sort_values()
        days = timestamps.to_series().dt.date
        cumulative_count = np.arange(1, len(days) + 1)
        plt.plot(days, cumulative_count, label=name)

    plt.legend()
    plt.xlabel('Date')
    plt.ylabel('Nombre de messages')
    plt.show()

In [None]:
def message_length(data, mode='moving', freq='365D'):
    """Plot message length (in characters) over time for each core member.

    For every entry of `btz_core_id`, prints
    ``name, mean length, mean length without spaces`` and draws one curve.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with `sender_id` and `msg_text` (text) columns and an index that
        `pd.to_datetime(..., yearfirst=True)` can parse. It is not modified.
    mode : {'moving', 'cum'}
        'moving' plots a time-based rolling mean of the message length;
        'cum' plots the running total of characters.
    freq : str
        Window passed to `Series.rolling`; only used when `mode='moving'`.

    Raises
    ------
    ValueError
        If `mode` is neither 'moving' nor 'cum'.
    """
    if mode not in ('moving', 'cum'):
        raise ValueError(f"mode must be 'moving' or 'cum', got {mode!r}")

    # Parse the index once and sort on the parsed timestamps. Every mask and
    # x-axis below is derived from this same frame, so x and y stay aligned
    # even when `data` itself is not in chronological order.
    messages = data[['sender_id', 'msg_text']].copy()
    messages.index = pd.to_datetime(messages.index, yearfirst=True)
    messages = messages.sort_index()

    for name, sender_id in btz_core_id.items():
        member_text = messages.loc[messages['sender_id'] == sender_id, 'msg_text']
        lengths = member_text.str.len()
        lengths_no_spaces = member_text.str.replace(' ', '', regex=False).str.len()
        print(f"{name}, {lengths.mean():.2f}, {lengths_no_spaces.mean():.2f}")

        days = lengths.index.to_series().dt.date
        if mode == 'moving':
            # Time-based window: relies on the sorted DatetimeIndex set above.
            plt.plot(days, lengths.rolling(freq).mean(), label=name)
        else:
            plt.plot(days, lengths.cumsum(), label=name)

    plt.legend()
    plt.xlabel('Date')
    plt.ylabel('Longueur (caractères)')
    if mode == 'moving':
        plt.title(f"Moyenne glissante de fréquence {freq} du nombre de caractères par message")
    else:
        plt.title("Somme cumulée du nombre de caractères")
    plt.show()

In [None]:
# Rolling mean of message length over a 365-day window, one curve per member.
message_length(
    my_data,
    mode='moving',
    freq='365D',
)

In [None]:
def most_messages(data):
    """Rank calendar days by the number of messages sent.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a `msg_text` column and an index that
        `pd.to_datetime(..., yearfirst=True)` can parse. It is left untouched:
        the parsed dates are kept local instead of overwriting `data.index`.

    Returns
    -------
    pandas.Series
        Count of non-null `msg_text` values per day, in descending order,
        indexed by the day formatted as ``DD/MM/YYYY``.
    """
    timestamps = pd.to_datetime(data.index, yearfirst=True)
    top = (
        data['msg_text']
        .groupby(timestamps.date)
        .count()
        .sort_values(ascending=False)
    )
    # The group keys are `datetime.date` objects; format them directly.
    top.index = [day.strftime('%d/%m/%Y') for day in top.index]
    return top

In [None]:
# The 30 busiest days, most active first.
most_messages(my_data).head(30)