{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.linear_model import LinearRegression \n",
    "\n",
    "pd.options.display.max_columns = 2000\n",
    "pd.options.display.max_rows = 2000\n",
    "\n",
    "def flatten_json(nested_json, exclude=('',)):\n",
    "    '''Flatten nested dicts/lists into a single-level dict.\n",
    "\n",
    "    Result keys are the path to each leaf joined with '_'; list items\n",
    "    contribute their position, e.g. {'a': [{'b': 1}]} -> {'a_0_b': 1}.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    nested_json : dict, list or scalar\n",
    "        Decoded JSON value to flatten.\n",
    "    exclude : iterable of str\n",
    "        Dict keys to skip at any depth (the whole subtree is dropped).\n",
    "        The default skips only the empty-string key.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Flat mapping of path -> leaf value. Empty dicts/lists produce no\n",
    "        entry, and a bare scalar input is stored under the key ''.\n",
    "        Two different paths that join to the same string overwrite each other.\n",
    "    '''\n",
    "    out = {}\n",
    "\n",
    "    def flatten(x, name=''):\n",
    "        if isinstance(x, dict):\n",
    "            for key, value in x.items():\n",
    "                if key not in exclude:\n",
    "                    flatten(value, f'{name}{key}_')\n",
    "        elif isinstance(x, list):\n",
    "            for position, item in enumerate(x):\n",
    "                flatten(item, f'{name}{position}_')\n",
    "        else:\n",
    "            # Drop the trailing '_' separator appended by the parent level.\n",
    "            out[name[:-1]] = x\n",
    "\n",
    "    flatten(nested_json)\n",
    "    return out\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read explicitly as UTF-8: sender names in this chat contain accented\n",
    "# characters, which the platform default encoding can mangle or reject.\n",
    "with open('result.json', encoding='utf-8') as json_file:\n",
    "    export = json.load(json_file)\n",
    "\n",
    "# One row per message; nested fields become '_'-joined columns.\n",
    "data = pd.DataFrame([flatten_json(message) for message in export['messages']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The text_entities_<n>_text columns only exist when at least one message has\n",
    "# that many entities, so keep the ones present instead of raising KeyError.\n",
    "entity_columns = [f'text_entities_{i}_text' for i in range(45, 49)]\n",
    "messages = data[['from', 'text'] + [col for col in entity_columns if col in data.columns]]\n",
    "\n",
    "# Strip punctuation inside each message. DataFrame.replace without regex=True\n",
    "# only matches cells whose entire value is ',', '.', '(' or ')', so it left\n",
    "# tokens such as 'hello,' and 'hello' counted separately.\n",
    "clean_text = messages['text'].str.replace(r'[,.()]', '', regex=True)\n",
    "words_freq = clean_text.str.lower().str.split().explode().value_counts()\n",
    "# Explicit column name so the frame looks the same across pandas versions.\n",
    "words_freq = words_freq.rename('count').to_frame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reindex instead of .loc so a token that never occurs shows 0 rather than raising KeyError.\n",
    "words_freq.reindex(['1337'], fill_value=0)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Messages cumulés sur la durée par membre."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the frame with assign() so the converted column lands on a new object;\n",
    "# assigning into a column slice of `data` triggers SettingWithCopyWarning.\n",
    "hist_data = data[['date', 'from', 'type']].assign(\n",
    "    date=lambda frame: pd.to_datetime(frame['date'], yearfirst=True).dt.date\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(data['from'].unique())\n",
    "btz_core = ['Nathan Spaeter', 'Luyzon', 'Clément Krebs', 't o', 'Leous', 'Arnaud']\n",
    "# Extended group: the core members followed by the additional names.\n",
    "btz_ext = [*btz_core, 'Senkei', 'Éléonore', 'Tozpa', 'Léo', 'XxX_MatthieuXPlume_XxX', 'poline', 'Sarah Guillemant']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_scores = {}  # NOTE(review): never filled or read in this notebook\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "for name in btz_core:\n",
    "    # hist_data['date'] already holds date objects, so no second conversion;\n",
    "    # sort so the running total is plotted in chronological order.\n",
    "    time_range = hist_data.loc[hist_data['from'] == name, 'date'].sort_values()\n",
    "    # The k-th message brings the running total to k, so count from 1, not 0.\n",
    "    cumulative_count = np.arange(1, len(time_range) + 1)\n",
    "    ax.plot(time_range, cumulative_count, label=name)\n",
    "ax.set_xlabel('Date')\n",
    "ax.set_ylabel('Cumulative number of messages')\n",
    "ax.set_title('Cumulative messages per member')\n",
    "ax.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of messages per (day, sender) pair.\n",
    "hist_data.groupby(['date','from']).size()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9 (main, Dec 19 2022, 17:35:49) [GCC 12.2.0]"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}