{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "kRpf_3Pb-uxs" }, "outputs": [], "source": [ "# Google Colab で実行している場合、ユーザの Google Drive をマウント(ファイルが使えるように)する。\n", "import os\n", "import sys\n", "moduleList = sys.modules\n", "ENV_COLAB = False\n", "if 'google.colab' in moduleList:\n", " ENV_COLAB = True\n", " # print(\"google_colab\")\n", "if ENV_COLAB:\n", " from google.colab import drive\n", " drive.mount('/content/drive')\n", " gdpath = \"/content/drive/MyDrive/\"\n", "else:\n", " gdpath = \"\"\n" ] }, { "cell_type": "code", "source": [ "# Google Drive のマイドライブ直下にファイルをアップロードしてください。\n", "# gz形式で圧縮されたファイルをアップロードした場合は、以下のようなコマンドで解凍できます。\n", "# ! gunzip /content/drive/MyDrive/MLS-full-cell-export-2023-01-31T000000.csv.gz" ], "metadata": { "id": "pancto0zOrUK" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# FYI : ローカルなデバイスを経由するのが面倒だとお嘆きの方が好むCLIコマンド:\n", "# ! cd /content/drive/MyDrive/ ; wget https://d2koia3g127518.cloudfront.net/export/MLS-full-cell-export-2023-02-20T000000.csv.gz" ], "metadata": { "id": "mJjqTGiH2kjx" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# マイドライブ直下のcsvファイルを読み込んで、DataFrame形式にする (数十秒〜1分間かかる場合も)\n", "import pandas as pd\n", "df = pd.read_csv(gdpath + 'MLS-full-cell-export-2023-01-31T000000.csv')\n", "df" ], "metadata": { "id": "WmQzSjCFkSX2" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# 各列の要約統計量を表示\n", "df.describe(include='all')" ], "metadata": { "id": "6RczEdMEiT1o" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# 各列の要約統計量を stat_val1 に格納して、表示\n", "stat_val1 = df.describe(include='all')\n", "stat_val1" ], "metadata": { "id": "fX_IfrqFBBfn" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# 値を取り出す\n", "stat_val1.loc['mean']['mcc']" ], "metadata": { "id": "CHdBLsWNmH6Z" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# 最頻値\n", "df_mode = df.mode()\n", "df_mode" ], "metadata": { "id": "-YHYnvTtkdHI" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# 値を取り出す\n", "df_mode.loc[0]['lon']" ], "metadata": { "id": "MYtnB4-VlsA_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# 中央値\n", "df_median = df.median()\n", "df_median" ], "metadata": { "id": "BKb6dH2Hkpf2" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# 値を取り出す\n", "df_median['lon']" ], "metadata": { "id": "PmfthoFGnjCt" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "各列同士がどれくらい関係があるか、相関行列を計算してみます。" ], "metadata": { "id": "VXcv7Ucz7jb-" } }, { "cell_type": "code", "source": [ "# 相関行列を計算する\n", "corr_matrix = df.corr()\n", "corr_matrix" ], "metadata": { "id": "n0nejRB97j6J" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# 相関行列のヒートマップ表示\n", "df.corr().style.background_gradient(cmap = \"bwr\", vmin = -1, vmax = 1)" ], "metadata": { "id": "S7hGrWT2Micu" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# 相関行列のヒートマップ表示: seaborn を使う場合\n", "import seaborn\n", "import matplotlib.pyplot as plt\n", "seaborn.heatmap(corr_matrix,\n", " vmin=-1.0, vmax=1.0, center=0,\n", " annot=True, fmt='.1f',\n", " xticklabels=corr_matrix.columns.values,\n", " yticklabels=corr_matrix.columns.values\n", " )\n", "plt.show()" ], "metadata": { "id": "tJvxoRjoE0wq" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df2 = df.copy() # df2 = df と書くと複製にならない(参照渡しになってしまう)\n", "# いくつかの列を削除する\n", "df2.drop('changeable', axis=1, inplace=True)\n", "df2.drop('averageSignal', axis=1, inplace=True)\n", "# df2 = df.loc[:,['mcc','cell','lon','lat']]\n", "df_sub = df2[::1000] # 1/1000 で間引いたスライス\n", "pd.plotting.scatter_matrix(df_sub, figsize=(15, 15))\n", "plt.show()" ], "metadata": { "id": "Oat0jmAaSmQ2" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df3 = df.loc[:,['lon','lat']]\n", "df3.plot.scatter(x=0, y=1, s=1, alpha=0.02, figsize=(11, 11))" ], "metadata": { "id": "ewmIf0byWHFJ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df4 = df.loc[:,['mcc','lat']]\n", "df4.plot.scatter(x=0, y=1, s=3, alpha=0.5, figsize=(11, 11))" ], "metadata": { "id": "TubWKgmUm6jV" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df_map = df[::10000] # 1/10000 に間引く\n", "\n", "import folium\n", "map = folium.Map(location=[df_map['lat'].mean(), df_map['lon'].mean()], zoom_start=2, width = 1000, height = 500)\n", "\n", "count = 0\n", "# データフレーム df_map の各行に対して地図上のマーカーを追加する\n", "for index, row in df_map.iterrows():\n", " folium.Marker([row['lat'], row['lon']], popup = 'mcc:'+str(row['mcc'])+' '+str(row['area'])).add_to(map)\n", " count += 1\n", " if(count>1000): # 念の為。1000点以上だと動作困難になる。\n", " break\n", "print(count)\n", "\n", "map" ], "metadata": { "id": "58SoOxdDkVth" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# mcc の値が 440(日本の国番号)のだけ抜き出す\n", "dfjp = df[df['mcc']==440]\n", "dfjp" ], "metadata": { "id": "cmz_vSZGXio2" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# さらに、created の時刻が 2022年12月のを抜き出す\n", "from datetime import datetime\n", "time_col = \"created\"\n", "start_date = datetime(2022, 12, 1)\n", "end_date = datetime(2022, 12, 31)\n", "\n", "start_unixtime = int(start_date.timestamp())\n", "end_unixtime = int(end_date.timestamp())\n", "\n", "dfjp_2022_12 = dfjp[(dfjp[time_col] >= start_unixtime) & (dfjp[time_col] <= end_unixtime)]\n", "dfjp_2022_12" ], "metadata": { "id": "md8FvTG1A0x3" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df_map = dfjp_2022_12[1::20] # 間引き\n", "\n", "import folium\n", "map = folium.Map(location=[df_map['lat'].mean(), df_map['lon'].mean()], zoom_start=4, width = 1000, height = 500)\n", "\n", "count = 0\n", "for index, row in df_map.iterrows():\n", " dt = datetime.fromtimestamp(row['created'])\n", " dt_str = dt.strftime('%Y-%m-%d %H:%M:%S')\n", " folium.Marker([row['lat'], row['lon']], popup = 'net:'+str(row['net'])+' ' +dt_str).add_to(map)\n", " count += 1\n", " if (count>1000): # 念の為。1000点以上だと動作困難になる。\n", " break\n", "print(count)\n", "\n", "map" ], "metadata": { "id": "BiryMaVWFRcJ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# 日本での2022年の各ネットワーク(通信会社)毎の月別の個数の推移\n", "import matplotlib.pyplot as plt\n", "\n", "# unixtime カラムを datetime 型に変換\n", "dfjp['datetime'] = pd.to_datetime(dfjp['created'], unit='s')\n", "\n", "# 2022年のデータに絞り込む(会社もある程度は限定)\n", "df_2022 = dfjp[(dfjp['datetime'] >= '2022-01-01') & (dfjp['datetime'] < '2023-01-01') & (dfjp['net']<100) & (dfjp['net']>9)]\n", "\n", "# 月別に集計する\n", "count_by_month = df_2022.groupby([df_2022['datetime'].dt.strftime('%Y-%m'),'net'])['created'].count().unstack()\n", "\n", "# グラフを描画する\n", "# count_by_month.plot(kind='bar', figsize=(10,5),stacked=True)\n", "count_by_month.plot(kind='line', figsize=(18,8), marker='o', linewidth=4)\n", "plt.title('2022 Monthly')\n", "plt.xlabel('Month')\n", "plt.ylabel('Count')\n", "plt.show()\n" ], "metadata": { "id": "Tl-WrqDZW6ar" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# グラフの一部を検算\n", "# dfjp[(dfjp['datetime']>='2022-05-01') & (dfjp['datetime']<'2022-06-01') &(dfjp['net']==10)]\n", "# dfjp[(dfjp['datetime']>='2022-05-01') & (dfjp['datetime']<'2022-06-01')]" ], "metadata": { "id": "2Xv5vnBSlXa_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "wenTDzRzlbW8" }, "execution_count": null, "outputs": [] } ] }