ソースを参照

recap notebook

arnaud 2 年 前
コミット
9faf0d057f
2 ファイル変更、110 行追加、0 行削除
  1. 2 0
      .gitignore
  2. 108 0
      recap.ipynb

+ 2 - 0
.gitignore

@@ -58,3 +58,5 @@ docs/_build/
 # PyBuilder
 target/
 
+# Privacy
+result.json

+ 108 - 0
recap.ipynb

@@ -0,0 +1,108 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import json\n",
+    "\n",
+    "pd.options.display.max_columns = 2000\n",
+    "pd.options.display.max_rows = 2000\n",
+    "\n",
+    "def flatten_json(nested_json, exclude=['']):\n",
+    "    out = {}\n",
+    "\n",
+    "    def flatten(x, name='', exclude=exclude):\n",
+    "        if type(x) is dict:\n",
+    "            for a in x:\n",
+    "                if a not in exclude: flatten(x[a], name + a + '_')\n",
+    "        elif type(x) is list:\n",
+    "            i=0\n",
+    "            for a in x:\n",
+    "                flatten(a, name + str(i) + '_')\n",
+    "                i += 1\n",
+    "        else:\n",
+    "            out[name[:-1]] = x\n",
+    "\n",
+    "    flatten(nested_json)\n",
+    "    return out\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('result.json') as json_file:\n",
+    "    data = json.load(json_file)\n",
+    "\n",
+    "data = pd.DataFrame([flatten_json(x) for x in data['messages']])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = data[['from', 'text','text_entities_45_text','text_entities_46_text','text_entities_47_text','text_entities_48_text']]\n",
+    "words_freq = messages.replace(to_replace=[',','.','(',')'],value='')\n",
+    "words_freq = words_freq.text.str.lower().str.split().explode().value_counts()\n",
+    "words_freq = pd.DataFrame(words_freq)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "text    220\n",
+       "Name: 1337, dtype: int64"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "words_freq.loc['1337']"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}