2 năm trước cách đây · bdfb4c3fec
--- a/.gitignore
+++ b/.gitignore
@@ -59,4 +59,4 @@ docs/_build/
 
				 target/
			
 
				 
			
 
				 # Privacy
			
 
				-result.json
			
 
				+result*.json
			
--- a/recap.ipynb
+++ b/recap.ipynb
@@ -7,31 +7,8 @@
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "import pandas as pd\n",
			
 
				-    "import numpy as np\n",
			
 
				-    "import json\n",
			
 
				     "import matplotlib.pyplot as plt\n",
			
 
				-    "from sklearn.linear_model import LinearRegression \n",
			
 
				-    "\n",
			
 
				-    "pd.options.display.max_columns = 2000\n",
			
 
				-    "pd.options.display.max_rows = 2000\n",
			
 
				-    "\n",
			
 
				-    "def flatten_json(nested_json, exclude=['']):\n",
			
 
				-    "    out = {}\n",
			
 
				-    "\n",
			
 
				-    "    def flatten(x, name='', exclude=exclude):\n",
			
 
				-    "        if type(x) is dict:\n",
			
 
				-    "            for a in x:\n",
			
 
				-    "                if a not in exclude: flatten(x[a], name + a + '_')\n",
			
 
				-    "        elif type(x) is list:\n",
			
 
				-    "            i=0\n",
			
 
				-    "            for a in x:\n",
			
 
				-    "                flatten(a, name + str(i) + '_')\n",
			
 
				-    "                i += 1\n",
			
 
				-    "        else:\n",
			
 
				-    "            out[name[:-1]] = x\n",
			
 
				-    "\n",
			
 
				-    "    flatten(nested_json)\n",
			
 
				-    "    return out\n"
			
 
				+    "import numpy as np"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -40,10 +17,9 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "with open('result.json') as json_file:\n",
			
 
				-    "    data = json.load(json_file)\n",
			
 
				-    "\n",
			
 
				-    "data = pd.DataFrame([flatten_json(x) for x in data['messages']])"
			
 
				+    "# You need to use the parser before to get the csv file.\n",
			
 
				+    "my_data = pd.read_csv('bretzel.csv', index_col='date', dtype=str)\n",
			
 
				+    "my_data = my_data.replace(to_replace=[np.NaN], value='')"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -52,10 +28,14 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "messages = data[['from', 'text','text_entities_45_text','text_entities_46_text','text_entities_47_text','text_entities_48_text']]\n",
			
 
				-    "words_freq = messages.replace(to_replace=[',','.','(',')'],value='')\n",
			
 
				-    "words_freq = words_freq.text.str.lower().str.split().explode().value_counts()\n",
			
 
				-    "words_freq = pd.DataFrame(words_freq)"
			
 
				+    "btz_core_id = {\n",
			
 
				+    "            'Nathan':   'user502298447',\n",
			
 
				+    "            'Louison':  'user435924815',\n",
			
 
				+    "            'Clément':  'user451690413',\n",
			
 
				+    "            'Théo':     'user490556008',\n",
			
 
				+    "            'Léopold':  'user483543500',\n",
			
 
				+    "            'Arnaud':   'user469821944',\n",
			
 
				+    "            }"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -64,25 +44,17 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "words_freq.loc['1337']"
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "attachments": {},
			
 
				-   "cell_type": "markdown",
			
 
				-   "metadata": {},
			
 
				-   "source": [
			
 
				-    "Messages cumulés sur la durée par membre."
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "code",
			
 
				-   "execution_count": null,
			
 
				-   "metadata": {},
			
 
				-   "outputs": [],
			
 
				-   "source": [
			
 
				-    "hist_data = data[['date','from', 'type']]\n",
			
 
				-    "hist_data['date'] =  pd.to_datetime(hist_data['date'], yearfirst=True).dt.date"
			
 
				+    "def words_frequency(data):\n",
			
 
				+    "    words_freq = data['msg_text']\n",
			
 
				+    "    words_freq = words_freq.astype(str)\n",
			
 
				+    "    remove_list = [',', '.', '(', ')', '{', '}', '[', ']', '-']\n",
			
 
				+    "    for char in remove_list:\n",
			
 
				+    "        words_freq = words_freq.str.replace(char, '')\n",
			
 
				+    "\n",
			
 
				+    "    # Replace apostrophes with spaces for word counting purposes\n",
			
 
				+    "    words_freq = words_freq.astype(str).str.replace('\\'', ' ')\n",
			
 
				+    "    words_freq = words_freq.str.lower().str.split(expand=True).stack().value_counts()\n",
			
 
				+    "    return words_freq"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -91,9 +63,16 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "print(data['from'].unique())\n",
			
 
				-    "btz_core = ['Nathan Spaeter', 'Luyzon', 'Clément Krebs', 't o', 'Leous', 'Arnaud']\n",
			
 
				-    "btz_ext = [*btz_core, *['Senkei', 'Éléonore', 'Tozpa', 'Léo', 'XxX_MatthieuXPlume_XxX', 'poline', 'Sarah Guillemant']]"
			
 
				+    "def plot_message_count(data):\n",
			
 
				+    "    for name in btz_core_id:\n",
			
 
				+    "        idx = data['sender_id']==btz_core_id.get(name)\n",
			
 
				+    "        time_range = pd.to_datetime(data.index[idx], yearfirst=True).to_series().dt.date\n",
			
 
				+    "        cumulative_count = 1+np.arange(time_range.shape[0])\n",
			
 
				+    "        plt.plot(time_range, cumulative_count, label=name)\n",
			
 
				+    "    plt.legend()\n",
			
 
				+    "    plt.xlabel('Date')\n",
			
 
				+    "    plt.ylabel('Nombre de messages')\n",
			
 
				+    "    plt.show()"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -102,15 +81,7 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "list_scores = {}\n",
			
 
				-    "for name in btz_core:\n",
			
 
				-    "    idx = hist_data['from']==name\n",
			
 
				-    "    time_range = pd.to_datetime(hist_data[idx]['date'], yearfirst=True).dt.date\n",
			
 
				-    "    cumulative_count = np.arange(time_range.shape[0])\n",
			
 
				-    "    plt.plot(time_range, cumulative_count, label=name)\n",
			
 
				-    "plt.legend()\n",
			
 
				-    "plt.show()\n",
			
 
				-    "    "
			
 
				+    "wf = words_frequency(my_data)"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -119,7 +90,7 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "hist_data.groupby(['date','from']).size()"
			
 
				+    "plot_message_count(my_data)"
			
 
				    ]
			
 
				   }
			
 
				  ],
			
@@ -139,7 +110,7 @@
 
				    "name": "python",
			
 
				    "nbconvert_exporter": "python",
			
 
				    "pygments_lexer": "ipython3",
			
 
				-   "version": "3.10.9 (main, Dec 19 2022, 17:35:49) [GCC 12.2.0]"
			
 
				+   "version": "3.11.3"
			
 
				   },
			
 
				   "orig_nbformat": 4,
			
 
				   "vscode": {