3 years ago · 9faf0d057f
--- a/.gitignore
+++ b/.gitignore
@@ -58,3 +58,5 @@ docs/_build/
 
				 # PyBuilder
			
 
				 target/
			
 
				 
			
 
				+# Privacy
			
 
				+result.json
			
--- a/recap.ipynb
+++ b/recap.ipynb
@@ -0,0 +1,108 @@
 
				+{
			
 
				+ "cells": [
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 7,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "import pandas as pd\n",
			
 
				+    "import numpy as np\n",
			
 
				+    "import json\n",
			
 
				+    "\n",
			
 
				+    "pd.options.display.max_columns = 2000\n",
			
 
				+    "pd.options.display.max_rows = 2000\n",
			
 
				+    "\n",
			
 
				+    "def flatten_json(nested_json, exclude=('',)):\n",
			
 
				+    "    \"\"\"Flatten nested dicts/lists into one flat dict keyed by '_'-joined paths; dict keys listed in `exclude` are skipped at every level.\"\"\"\n",
			
 
				+    "    out = {}\n",
			
 
				+    "\n",
			
 
				+    "    def flatten(x, name=''):\n",
			
 
				+    "        if isinstance(x, dict):\n",
			
 
				+    "            for key, value in x.items():\n",
			
 
				+    "                if key not in exclude:\n",
			
 
				+    "                    flatten(value, name + key + '_')\n",
			
 
				+    "        elif isinstance(x, list):\n",
			
 
				+    "            for i, item in enumerate(x):\n",
			
 
				+    "                flatten(item, name + str(i) + '_')\n",
			
 
				+    "        else:\n",
			
 
				+    "            out[name[:-1]] = x  # leaf: strip the trailing '_' separator\n",
			
 
				+    "\n",
			
 
				+    "    flatten(nested_json)\n",
			
 
				+    "    return out\n"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 2,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "with open('result.json', encoding='utf-8') as json_file:  # explicit encoding: do not depend on the platform default\n",
			
 
				+    "    data = json.load(json_file)\n",
			
 
				+    "# Build one flattened row per entry in data['messages'].\n",
			
 
				+    "data = pd.DataFrame([flatten_json(x) for x in data['messages']])"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 11,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "messages = data[['from', 'text','text_entities_45_text','text_entities_46_text','text_entities_47_text','text_entities_48_text']]\n",
			
 
				+    "words_freq = messages.replace(to_replace=r'[,.()]', value='', regex=True)  # regex=True strips punctuation inside cells; a plain replace only matches whole-cell values\n",
			
 
				+    "words_freq = words_freq.text.str.lower().str.split().explode().value_counts()\n",
			
 
				+    "words_freq = pd.DataFrame(words_freq)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 34,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "data": {
			
 
				+      "text/plain": [
			
 
				+       "text    220\n",
			
 
				+       "Name: 1337, dtype: int64"
			
 
				+      ]
			
 
				+     },
			
 
				+     "execution_count": 34,
			
 
				+     "metadata": {},
			
 
				+     "output_type": "execute_result"
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "words_freq.loc['1337', :]"
			
 
				+   ]
			
 
				+  }
			
 
				+ ],
			
 
				+ "metadata": {
			
 
				+  "kernelspec": {
			
 
				+   "display_name": "Python 3",
			
 
				+   "language": "python",
			
 
				+   "name": "python3"
			
 
				+  },
			
 
				+  "language_info": {
			
 
				+   "codemirror_mode": {
			
 
				+    "name": "ipython",
			
 
				+    "version": 3
			
 
				+   },
			
 
				+   "file_extension": ".py",
			
 
				+   "mimetype": "text/x-python",
			
 
				+   "name": "python",
			
 
				+   "nbconvert_exporter": "python",
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.10.9"
			
 
				+  },
			
 
				+  "orig_nbformat": 4,
			
 
				+  "vscode": {
			
 
				+   "interpreter": {
			
 
				+    "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
			
 
				+   }
			
 
				+  }
			
 
				+ },
			
 
				+ "nbformat": 4,
			
 
				+ "nbformat_minor": 2
			
 
				+}