|
|
@@ -69,6 +69,7 @@
|
|
|
" time_range = pd.to_datetime(data.index[idx], yearfirst=True).to_series().dt.date\n",
|
|
|
" cumulative_count = 1+np.arange(time_range.shape[0])\n",
|
|
|
" plt.plot(time_range, cumulative_count, label=name)\n",
|
|
|
+ " \n",
|
|
|
" plt.legend()\n",
|
|
|
" plt.xlabel('Date')\n",
|
|
|
" plt.ylabel('Nombre de messages')\n",
|
|
|
@@ -81,7 +82,54 @@
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "wf = words_frequency(my_data)"
|
|
|
+ "def message_length(data, mode='moving', freq='365D'):\n",
|
|
|
+ "    \"\"\"Print and plot message lengths for every entry of `btz_core_id`.\n",
|
|
|
+ "\n",
|
|
|
+ "    For each name/sender id pair, prints the mean message length with and\n",
|
|
|
+ "    without spaces, then adds one curve to the current figure.\n",
|
|
|
+ "\n",
|
|
|
+ "    Args:\n",
|
|
|
+ "        data: DataFrame with 'sender_id' and 'msg_text' columns and an index\n",
|
|
|
+ "            parseable by `pd.to_datetime(..., yearfirst=True)`. Not modified.\n",
|
|
|
+ "        mode: 'moving' plots a time-based rolling mean, 'cum' the running\n",
|
|
|
+ "            total of characters.\n",
|
|
|
+ "        freq: Rolling window as a pandas offset string (used by 'moving').\n",
|
|
|
+ "\n",
|
|
|
+ "    Raises:\n",
|
|
|
+ "        ValueError: If `mode` is neither 'moving' nor 'cum'.\n",
|
|
|
+ "    \"\"\"\n",
|
|
|
+ "    if mode not in ('moving', 'cum'):\n",
|
|
|
+ "        raise ValueError(f\"mode must be 'moving' or 'cum', got {mode!r}\")\n",
|
|
|
+ "\n",
|
|
|
+ "    # Work on a copy so sorting never touches (or warns about) the caller's frame.\n",
|
|
|
+ "    text_message = data[['sender_id', 'msg_text']].copy()\n",
|
|
|
+ "    # Parse before sorting: the rolling window needs chronological order, which\n",
|
|
|
+ "    # sorting raw index labels does not guarantee.\n",
|
|
|
+ "    text_message.index = pd.to_datetime(text_message.index, yearfirst=True)\n",
|
|
|
+ "    text_message = text_message.sort_index()\n",
|
|
|
+ "\n",
|
|
|
+ "    for name, sender_id in btz_core_id.items():\n",
|
|
|
+ "        user_messages = text_message.loc[text_message['sender_id'] == sender_id, 'msg_text']\n",
|
|
|
+ "        user_messages_length = user_messages.str.len()\n",
|
|
|
+ "        user_messages_length_no_spaces = user_messages.str.replace(' ', '', regex=False).str.len()\n",
|
|
|
+ "        print(f\"{name}, {user_messages_length.mean():.2f}, {user_messages_length_no_spaces.mean():.2f}\")\n",
|
|
|
+ "\n",
|
|
|
+ "        # Dates come from the same sorted rows as the lengths, keeping x and y aligned.\n",
|
|
|
+ "        time_range = user_messages_length.index.date\n",
|
|
|
+ "        if mode == 'moving':\n",
|
|
|
+ "            plt.plot(time_range, user_messages_length.rolling(freq).mean(), label=name)\n",
|
|
|
+ "        else:\n",
|
|
|
+ "            plt.plot(time_range, user_messages_length.expanding(1).sum(), label=name)\n",
|
|
|
+ "\n",
|
|
|
+ "    plt.legend()\n",
|
|
|
+ "    plt.xlabel('Date')\n",
|
|
|
+ "    plt.ylabel('Longueur (caractères)')\n",
|
|
|
+ "    if mode == 'moving':\n",
|
|
|
+ "        plt.title(f\"Moyenne glissante de fréquence {freq} du nombre de caractères par message\")\n",
|
|
|
+ "    else:\n",
|
|
|
+ "        plt.title('Somme cumulée du nombre de caractères')\n",
|
|
|
+ "    plt.show()"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# One-year rolling mean of message length, one curve per entry of btz_core_id.\n",
|
|
|
+ "message_length(data=my_data, mode='moving', freq='365D')"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "def most_messages(data):\n",
|
|
|
+ "    \"\"\"Return the number of messages per calendar day, busiest day first.\n",
|
|
|
+ "\n",
|
|
|
+ "    The index of `data` is parsed with `pd.to_datetime(..., yearfirst=True)`\n",
|
|
|
+ "    into a local variable, so the caller's DataFrame is left untouched.\n",
|
|
|
+ "    Only non-null 'msg_text' values are counted.\n",
|
|
|
+ "\n",
|
|
|
+ "    Returns:\n",
|
|
|
+ "        Series of counts indexed by 'dd/mm/YYYY' strings, sorted by\n",
|
|
|
+ "        descending count.\n",
|
|
|
+ "    \"\"\"\n",
|
|
|
+ "    # Do not assign to data.index: that silently changed the caller's frame.\n",
|
|
|
+ "    days = pd.to_datetime(data.index, yearfirst=True).date\n",
|
|
|
+ "    top = data['msg_text'].groupby(days).count().sort_values(ascending=False)\n",
|
|
|
+ "    top.index = pd.to_datetime(top.index, format='%Y-%m-%d').strftime('%d/%m/%Y')\n",
|
|
|
+ "    return top"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
|
@@ -90,7 +138,7 @@
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "plot_message_count(my_data)"
|
|
|
+ "# Show the 30 days with the most messages.\n",
|
|
|
+ "most_messages(my_data).head(30)"
|
|
|
]
|
|
|
}
|
|
|
],
|