|
|
@@ -7,31 +7,8 @@
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
"import pandas as pd\n",
|
|
|
- "import numpy as np\n",
|
|
|
- "import json\n",
|
|
|
"import matplotlib.pyplot as plt\n",
|
|
|
- "from sklearn.linear_model import LinearRegression \n",
|
|
|
- "\n",
|
|
|
- "pd.options.display.max_columns = 2000\n",
|
|
|
- "pd.options.display.max_rows = 2000\n",
|
|
|
- "\n",
|
|
|
- "def flatten_json(nested_json, exclude=['']):\n",
|
|
|
- " out = {}\n",
|
|
|
- "\n",
|
|
|
- " def flatten(x, name='', exclude=exclude):\n",
|
|
|
- " if type(x) is dict:\n",
|
|
|
- " for a in x:\n",
|
|
|
- " if a not in exclude: flatten(x[a], name + a + '_')\n",
|
|
|
- " elif type(x) is list:\n",
|
|
|
- " i=0\n",
|
|
|
- " for a in x:\n",
|
|
|
- " flatten(a, name + str(i) + '_')\n",
|
|
|
- " i += 1\n",
|
|
|
- " else:\n",
|
|
|
- " out[name[:-1]] = x\n",
|
|
|
- "\n",
|
|
|
- " flatten(nested_json)\n",
|
|
|
- " return out\n"
|
|
|
+ "import numpy as np"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
|
@@ -40,10 +17,9 @@
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "with open('result.json') as json_file:\n",
|
|
|
- " data = json.load(json_file)\n",
|
|
|
- "\n",
|
|
|
- "data = pd.DataFrame([flatten_json(x) for x in data['messages']])"
|
|
|
+ "# You need to run the parser first to generate the csv file.\n",
|
|
|
+ "my_data = pd.read_csv('bretzel.csv', index_col='date', dtype=str)\n",
|
|
|
+ "my_data = my_data.replace(to_replace=[np.nan], value='')"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
|
@@ -52,10 +28,14 @@
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "messages = data[['from', 'text','text_entities_45_text','text_entities_46_text','text_entities_47_text','text_entities_48_text']]\n",
|
|
|
- "words_freq = messages.replace(to_replace=[',','.','(',')'],value='')\n",
|
|
|
- "words_freq = words_freq.text.str.lower().str.split().explode().value_counts()\n",
|
|
|
- "words_freq = pd.DataFrame(words_freq)"
|
|
|
+ "btz_core_id = {\n",
|
|
|
+ " 'Nathan': 'user502298447',\n",
|
|
|
+ " 'Louison': 'user435924815',\n",
|
|
|
+ " 'Clément': 'user451690413',\n",
|
|
|
+ " 'Théo': 'user490556008',\n",
|
|
|
+ " 'Léopold': 'user483543500',\n",
|
|
|
+ " 'Arnaud': 'user469821944',\n",
|
|
|
+ " }"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
|
@@ -64,25 +44,17 @@
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "words_freq.loc['1337']"
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "attachments": {},
|
|
|
- "cell_type": "markdown",
|
|
|
- "metadata": {},
|
|
|
- "source": [
|
|
|
- "Messages cumulés sur la durée par membre."
|
|
|
- ]
|
|
|
- },
|
|
|
- {
|
|
|
- "cell_type": "code",
|
|
|
- "execution_count": null,
|
|
|
- "metadata": {},
|
|
|
- "outputs": [],
|
|
|
- "source": [
|
|
|
- "hist_data = data[['date','from', 'type']]\n",
|
|
|
- "hist_data['date'] = pd.to_datetime(hist_data['date'], yearfirst=True).dt.date"
|
|
|
+ "def words_frequency(data):\n",
|
|
|
+ " words_freq = data['msg_text']\n",
|
|
|
+ " words_freq = words_freq.astype(str)\n",
|
|
|
+ " remove_list = [',', '.', '(', ')', '{', '}', '[', ']', '-']\n",
|
|
|
+ " for char in remove_list:\n",
|
|
|
+ " words_freq = words_freq.str.replace(char, '', regex=False)\n",
|
|
|
+ "\n",
|
|
|
+ " # Replace apostrophes with spaces for word counting purposes\n",
|
|
|
+ " words_freq = words_freq.astype(str).str.replace('\\'', ' ')\n",
|
|
|
+ " words_freq = words_freq.str.lower().str.split(expand=True).stack().value_counts()\n",
|
|
|
+ " return words_freq"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
|
@@ -91,9 +63,16 @@
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "print(data['from'].unique())\n",
|
|
|
- "btz_core = ['Nathan Spaeter', 'Luyzon', 'Clément Krebs', 't o', 'Leous', 'Arnaud']\n",
|
|
|
- "btz_ext = [*btz_core, *['Senkei', 'Éléonore', 'Tozpa', 'Léo', 'XxX_MatthieuXPlume_XxX', 'poline', 'Sarah Guillemant']]"
|
|
|
+ "def plot_message_count(data):\n",
|
|
|
+ " for name in btz_core_id:\n",
|
|
|
+ " idx = data['sender_id']==btz_core_id.get(name)\n",
|
|
|
+ " time_range = pd.to_datetime(data.index[idx], yearfirst=True).to_series().dt.date\n",
|
|
|
+ " cumulative_count = 1+np.arange(time_range.shape[0])\n",
|
|
|
+ " plt.plot(time_range, cumulative_count, label=name)\n",
|
|
|
+ " plt.legend()\n",
|
|
|
+ " plt.xlabel('Date')\n",
|
|
|
+ " plt.ylabel('Nombre de messages')\n",
|
|
|
+ " plt.show()"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
|
@@ -102,15 +81,7 @@
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "list_scores = {}\n",
|
|
|
- "for name in btz_core:\n",
|
|
|
- " idx = hist_data['from']==name\n",
|
|
|
- " time_range = pd.to_datetime(hist_data[idx]['date'], yearfirst=True).dt.date\n",
|
|
|
- " cumulative_count = np.arange(time_range.shape[0])\n",
|
|
|
- " plt.plot(time_range, cumulative_count, label=name)\n",
|
|
|
- "plt.legend()\n",
|
|
|
- "plt.show()\n",
|
|
|
- " "
|
|
|
+ "wf = words_frequency(my_data)"
|
|
|
]
|
|
|
},
|
|
|
{
|
|
|
@@ -119,7 +90,7 @@
|
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
|
"source": [
|
|
|
- "hist_data.groupby(['date','from']).size()"
|
|
|
+ "plot_message_count(my_data)"
|
|
|
]
|
|
|
}
|
|
|
],
|
|
|
@@ -139,7 +110,7 @@
|
|
|
"name": "python",
|
|
|
"nbconvert_exporter": "python",
|
|
|
"pygments_lexer": "ipython3",
|
|
|
- "version": "3.10.9 (main, Dec 19 2022, 17:35:49) [GCC 12.2.0]"
|
|
|
+ "version": "3.11.3"
|
|
|
},
|
|
|
"orig_nbformat": 4,
|
|
|
"vscode": {
|