浏览代码

top messages

arnaud 2 年之前
父节点
当前提交
6eca21442e
共有 1 个文件被更改,包括 50 次插入和 2 次删除
  1. 50 2
      recap.ipynb

+ 50 - 2
recap.ipynb

@@ -69,6 +69,7 @@
     "        time_range = pd.to_datetime(data.index[idx], yearfirst=True).to_series().dt.date\n",
     "        cumulative_count = 1+np.arange(time_range.shape[0])\n",
     "        plt.plot(time_range, cumulative_count, label=name)\n",
+    "        \n",
     "    plt.legend()\n",
     "    plt.xlabel('Date')\n",
     "    plt.ylabel('Nombre de messages')\n",
@@ -81,7 +82,54 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "wf = words_frequency(my_data)"
+    "def message_length(data, mode='moving', freq='365D'):\n",
+    "    \"\"\"Print average message lengths and plot message length over time.\n",
+    "\n",
+    "    For every name in the notebook-level ``btz_core_id`` mapping, prints\n",
+    "    ``name, mean length, mean length without spaces`` and adds one curve\n",
+    "    to the current matplotlib figure, which is then shown.\n",
+    "\n",
+    "    Args:\n",
+    "        data: DataFrame with 'sender_id' and 'msg_text' columns, indexed\n",
+    "            by message timestamps that ``pd.to_datetime`` can parse.\n",
+    "        mode: 'moving' for a time-window rolling mean of the length,\n",
+    "            'cum' for the cumulative sum of characters.\n",
+    "        freq: window passed to ``Series.rolling``; only used by 'moving'.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: if ``mode`` is neither 'moving' nor 'cum'.\n",
+    "    \"\"\"\n",
+    "    # Fail early instead of silently showing an empty figure.\n",
+    "    if mode not in ('moving', 'cum'):\n",
+    "        raise ValueError(f\"mode must be 'moving' or 'cum', got {mode!r}\")\n",
+    "\n",
+    "    # Work on a copy: sorting a column slice of `data` in place raises\n",
+    "    # SettingWithCopyWarning and must not touch the caller's frame.\n",
+    "    text_message = data[['sender_id', 'msg_text']].copy()\n",
+    "    # Parse timestamps once, before sorting, so the order is chronological\n",
+    "    # (time-based rolling windows need a monotonic DatetimeIndex).\n",
+    "    text_message.index = pd.to_datetime(text_message.index, yearfirst=True)\n",
+    "    text_message = text_message.sort_index()\n",
+    "\n",
+    "    for name in btz_core_id:\n",
+    "        is_sender = text_message['sender_id'] == btz_core_id.get(name)\n",
+    "        user_text = text_message.loc[is_sender, 'msg_text']\n",
+    "        user_messages_length = user_text.str.len()\n",
+    "        user_messages_length_no_spaces = user_text.str.replace(' ', '', regex=False).str.len()\n",
+    "        print(f\"{name}, {user_messages_length.mean():.2f}, {user_messages_length_no_spaces.mean():.2f}\")\n",
+    "\n",
+    "        # Take the x values from the same sorted rows as the y values so\n",
+    "        # both axes stay aligned even when `data` itself is unsorted.\n",
+    "        time_range = user_messages_length.index.date\n",
+    "        if mode == 'moving':\n",
+    "            plt.plot(time_range, user_messages_length.rolling(freq).mean(), label=name)\n",
+    "        else:\n",
+    "            plt.plot(time_range, user_messages_length.expanding(1).sum(), label=name)\n",
+    "\n",
+    "    plt.legend()\n",
+    "    plt.xlabel('Date')\n",
+    "    plt.ylabel('Longueur (caractères)')\n",
+    "    if mode == 'moving':\n",
+    "        plt.title(f\"Moyenne glissante de fréquence {freq} du nombre de caractères par message\")\n",
+    "    else:\n",
+    "        plt.title(\"Somme cumulée du nombre de caractères\")\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "message_length(my_data, mode='moving', freq='365D')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def most_messages(data):\n",
+    "    \"\"\"Count messages per calendar day, busiest day first.\n",
+    "\n",
+    "    Args:\n",
+    "        data: DataFrame with a 'msg_text' column, indexed by message\n",
+    "            timestamps that ``pd.to_datetime`` can parse.\n",
+    "\n",
+    "    Returns:\n",
+    "        Series of non-null 'msg_text' counts per day, sorted in descending\n",
+    "        order and indexed by the day formatted as 'DD/MM/YYYY'.\n",
+    "    \"\"\"\n",
+    "    # Parse into a local variable: assigning to data.index would silently\n",
+    "    # re-index the caller's DataFrame.\n",
+    "    days = pd.to_datetime(data.index, yearfirst=True).date\n",
+    "    messages = data['msg_text']\n",
+    "    top = messages.groupby(days).count().sort_values(ascending=False)\n",
+    "    top.index = pd.to_datetime(top.index, format='%Y-%m-%d').strftime('%d/%m/%Y')\n",
+    "    return top"
    ]
   },
   {
@@ -90,7 +138,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "plot_message_count(my_data)"
+    "most_messages(my_data)[:30]"
    ]
   }
  ],