|
|
@@ -0,0 +1,108 @@
|
|
|
+{
|
|
|
+ "cells": [
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 7,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "import pandas as pd\n",
|
|
|
+ "import numpy as np\n",
|
|
|
+ "import json\n",
|
|
|
+ "\n",
|
|
|
+ "pd.options.display.max_columns = 2000\n",
|
|
|
+ "pd.options.display.max_rows = 2000\n",
|
|
|
+ "\n",
|
|
|
+ "def flatten_json(nested_json, exclude=('',)):\n",
+ "    \"\"\"Flatten nested dicts and lists into one single-level dict.\n",
+ "\n",
+ "    Dict keys and list positions along the path to each leaf are joined\n",
+ "    with '_', e.g. {'a': {'b': [1, 2]}} -> {'a_b_0': 1, 'a_b_1': 2}.\n",
+ "\n",
+ "    Args:\n",
+ "        nested_json: dict, list or scalar to flatten.\n",
+ "        exclude: dict keys to skip, along with everything nested under them.\n",
+ "\n",
+ "    Returns:\n",
+ "        A new dict mapping each joined path to its leaf value. Empty dicts\n",
+ "        and lists contribute no entries.\n",
+ "    \"\"\"\n",
+ "    out = {}\n",
+ "\n",
+ "    def flatten(x, name=''):\n",
+ "        # isinstance (not an exact type check) so dict/list subclasses are\n",
+ "        # walked instead of being stored whole as a leaf.\n",
+ "        if isinstance(x, dict):\n",
+ "            for key, value in x.items():\n",
+ "                if key not in exclude:\n",
+ "                    flatten(value, name + key + '_')\n",
+ "        elif isinstance(x, list):\n",
+ "            for i, item in enumerate(x):\n",
+ "                flatten(item, name + str(i) + '_')\n",
+ "        else:\n",
+ "            # Drop the trailing '_' separator; a bare top-level scalar has an\n",
+ "            # empty path and is stored under ''.\n",
+ "            out[name[:-1]] = x\n",
+ "\n",
+ "    flatten(nested_json)\n",
+ "    return out\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 2,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# JSON text is UTF-8 by specification; pass the encoding explicitly instead of\n",
+ "# relying on the platform's locale default.\n",
+ "with open('result.json', encoding='utf-8') as json_file:\n",
+ "    data = json.load(json_file)\n",
+ "\n",
+ "# One row per entry of data['messages']; nested fields become '_'-joined columns.\n",
+ "data = pd.DataFrame([flatten_json(x) for x in data['messages']])"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 11,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "messages = data[['from', 'text','text_entities_45_text','text_entities_46_text','text_entities_47_text','text_entities_48_text']]\n",
+ "# DataFrame.replace without regex=True only swaps cells that are exactly ',', '.',\n",
+ "# '(' or ')'; strip those characters inside each message text instead so that\n",
+ "# 'word,' and 'word' are counted as the same token.\n",
+ "words_freq = messages.text.str.replace('[,.()]', '', regex=True)\n",
+ "# Lower-case, split on whitespace, one row per word, then count occurrences.\n",
+ "words_freq = words_freq.str.lower().str.split().explode().value_counts()\n",
+ "words_freq = pd.DataFrame(words_freq)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 34,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "data": {
|
|
|
+ "text/plain": [
|
|
|
+ "text 220\n",
|
|
|
+ "Name: 1337, dtype: int64"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ "execution_count": 34,
|
|
|
+ "metadata": {},
|
|
|
+ "output_type": "execute_result"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# Show the count row stored under the token '1337'.\n",
+ "words_freq.loc['1337', :]"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "metadata": {
|
|
|
+ "kernelspec": {
|
|
|
+ "display_name": "Python 3",
|
|
|
+ "language": "python",
|
|
|
+ "name": "python3"
|
|
|
+ },
|
|
|
+ "language_info": {
|
|
|
+ "codemirror_mode": {
|
|
|
+ "name": "ipython",
|
|
|
+ "version": 3
|
|
|
+ },
|
|
|
+ "file_extension": ".py",
|
|
|
+ "mimetype": "text/x-python",
|
|
|
+ "name": "python",
|
|
|
+ "nbconvert_exporter": "python",
|
|
|
+ "pygments_lexer": "ipython3",
|
|
|
+ "version": "3.10.9"
|
|
|
+ },
|
|
|
+ "orig_nbformat": 4,
|
|
|
+ "vscode": {
|
|
|
+ "interpreter": {
|
|
|
+ "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "nbformat": 4,
|
|
|
+ "nbformat_minor": 2
|
|
|
+}
|