{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.linear_model import LinearRegression \n",
    "\n",
    "pd.options.display.max_columns = 2000\n",
    "pd.options.display.max_rows = 2000\n",
    "\n",
    "def flatten_json(nested_json, exclude=['']):\n",
    "    out = {}\n",
    "\n",
    "    def flatten(x, name='', exclude=exclude):\n",
    "        if type(x) is dict:\n",
    "            for a in x:\n",
    "                if a not in exclude: flatten(x[a], name + a + '_')\n",
    "        elif type(x) is list:\n",
    "            i=0\n",
    "            for a in x:\n",
    "                flatten(a, name + str(i) + '_')\n",
    "                i += 1\n",
    "        else:\n",
    "            out[name[:-1]] = x\n",
    "\n",
    "    flatten(nested_json)\n",
    "    return out\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('result.json') as json_file:\n",
    "    data = json.load(json_file)\n",
    "\n",
    "data = pd.DataFrame([flatten_json(x) for x in data['messages']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "messages = data[['from', 'text','text_entities_45_text','text_entities_46_text','text_entities_47_text','text_entities_48_text']]\n",
    "words_freq = messages.replace(to_replace=[',','.','(',')'],value='')\n",
    "words_freq = words_freq.text.str.lower().str.split().explode().value_counts()\n",
    "words_freq = pd.DataFrame(words_freq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "words_freq.loc['1337']"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Messages cumulés sur la durée par membre."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hist_data = data[['date','from', 'type']]\n",
    "hist_data['date'] =  pd.to_datetime(hist_data['date'], yearfirst=True).dt.date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(data['from'].unique())\n",
    "btz_core = ['Nathan Spaeter', 'Luyzon', 'Clément Krebs', 't o', 'Leous', 'Arnaud']\n",
    "btz_ext = [*btz_core, *['Senkei', 'Éléonore', 'Tozpa', 'Léo', 'XxX_MatthieuXPlume_XxX', 'poline', 'Sarah Guillemant']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_scores = {}\n",
    "for name in btz_core:\n",
    "    idx = hist_data['from']==name\n",
    "    time_range = pd.to_datetime(hist_data[idx]['date'], yearfirst=True).dt.date\n",
    "    cumulative_count = np.arange(time_range.shape[0])\n",
    "    plt.plot(time_range, cumulative_count, label=name)\n",
    "plt.legend()\n",
    "plt.show()\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hist_data.groupby(['date','from']).size()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9 (main, Dec 19 2022, 17:35:49) [GCC 12.2.0]"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}