From afdc99bfadc7e815a1f9aaa9e64ac0ad7274a1c1 Mon Sep 17 00:00:00 2001 From: KOSEUNGBIN Date: Sun, 11 Aug 2024 16:19:01 +0900 Subject: [PATCH] =?UTF-8?q?=EC=B5=9C=EC=A2=85=EA=B3=BC=EC=A0=9C=20?= =?UTF-8?q?=EC=A0=9C=EC=B6=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit a91998909cd4cfdd61ea6d6c694d5bde9d620593) --- ...ungbin2024_spark_study_final_project.ipynb | 660 ++++++++++++++++++ 1 file changed, 660 insertions(+) create mode 100644 koseungbin2024_spark_study_final_project.ipynb diff --git a/koseungbin2024_spark_study_final_project.ipynb b/koseungbin2024_spark_study_final_project.ipynb new file mode 100644 index 0000000..cd92cd3 --- /dev/null +++ b/koseungbin2024_spark_study_final_project.ipynb @@ -0,0 +1,660 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "mount_file_id": "1KgTVPmqkbxAz-6fOwB7PoxT_gy90j_1O", + "authorship_tag": "ABX9TyN2ezpxYneJMdQcTYSjghsD", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# [데이터셋]\n", + "- https://www.kaggle.com/datasets/brllrb/uber-and-lyft-dataset-boston-ma\n" + ], + "metadata": { + "id": "XnDy_QHUdXU_" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "uzPTXxUcByYJ" + }, + "outputs": [], + "source": [ + "!apt-get install openjdk-8-jdk-headless -qq > /dev/null\n", + "!cp /content/drive/MyDrive/colab_notebooks/spark/spark-3.5.1-bin-hadoop3.tgz /content/\n", + "!tar xf spark-3.5.1-bin-hadoop3.tgz\n", + "!pip install -q findspark\n", + "\n", + "!cp -r /content/drive/MyDrive/colab_notebooks/sample_data /content/\n", + "\n", + "import os\n", + "os.environ[\"JAVA_HOME\"] = 
\"/usr/lib/jvm/java-8-openjdk-amd64\"\n", + "os.environ[\"SPARK_HOME\"] = \"/content/spark-3.5.1-bin-hadoop3\"" + ] + }, + { + "cell_type": "code", + "source": [ + "import findspark\n", + "findspark.init()\n", + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession \\\n", + " .builder \\\n", + " .master(\"local[*]\") \\\n", + " .getOrCreate()\n", + "\n", + "spark.conf.set(\"spark.sql.repl.eagerEval.enabled\", True)\n", + "spark" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 219 + }, + "id": "N8x70X_7B7vi", + "outputId": "0ef0e064-3996-4c51-e411-dc3a1ec547c4" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "
\n", + "

SparkSession - in-memory

\n", + " \n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v3.5.1
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
pyspark-shell
\n", + "
\n", + "
\n", + " \n", + "
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, BooleanType


# Load the Uber/Lyft Boston ride-share dataset.
# The file marks missing values with the literal string "NA" (visible in the
# `price` column). Without `nullValue`, schema inference has to type such a
# column as string; declaring the marker lets it be inferred as numeric with
# real nulls, which the downstream avg()/sum() aggregations skip.
raw_df = spark.read.csv(
    './sample_data/yellow_taxi/rideshare_kaggle.csv',
    header=True,
    sep=',',
    inferSchema=True,
    nullValue='NA',
)

# raw data
print("[raw data]")
raw_df.show()
long_summary|precipIntensity|precipProbability|humidity|windSpeed|windGust|windGustTime|visibility|temperatureHigh|temperatureHighTime|temperatureLow|temperatureLowTime|apparentTemperatureHigh|apparentTemperatureHighTime|apparentTemperatureLow|apparentTemperatureLowTime| icon|dewPoint|pressure|windBearing|cloudCover|uvIndex|visibility.1|ozone|sunriseTime|sunsetTime|moonPhase|precipIntensityMax|uvIndexTime|temperatureMin|temperatureMinTime|temperatureMax|temperatureMaxTime|apparentTemperatureMin|apparentTemperatureMinTime|apparentTemperatureMax|apparentTemperatureMaxTime|\n", + "+--------------------+----------------+----+---+-----+-------------------+----------------+----------------+--------------------+--------+--------------------+------------+-----+--------+----------------+--------+---------+-----------+-------------------+---------------+--------------------+---------------+-----------------+--------+---------+--------+------------+----------+---------------+-------------------+--------------+------------------+-----------------------+---------------------------+----------------------+--------------------------+--------------------+--------+--------+-----------+----------+-------+------------+-----+-----------+----------+---------+------------------+-----------+--------------+------------------+--------------+------------------+----------------------+--------------------------+----------------------+--------------------------+\n", + "|424553bb-7174-41e...| 1.54495260789E9| 9| 16| 12|2018-12-16 09:30:07|America/New_York|Haymarket Square| North Station| Lyft| lyft_line| Shared| 5| 0.44| 1.0| 42.2148| -71.033| 42.34| 37.12| Mostly Cloudy | Rain throughout ...| 0.0| 0.0| 0.68| 8.66| 9.17| 1545015600| 10.0| 43.68| 1544968800| 34.19| 1545048000| 37.95| 1544968800| 27.39| 1545044400| partly-cloudy-ni...| 32.7| 1021.98| 57| 0.72| 0| 10.0|303.8| 1544962084|1544994864| 0.3| 0.1276| 1544979600| 39.89| 1545012000| 43.68| 1544968800| 33.73| 1545012000| 38.07| 
1544958000|\n", + "|4bd23055-6827-41c...|1.543284023677E9| 2| 27| 11|2018-11-27 02:00:23|America/New_York|Haymarket Square| North Station| Lyft| lyft_premier| Lux| 11| 0.44| 1.0| 42.2148| -71.033| 43.58| 37.35| Rain | Rain until morni...| 0.1299| 1.0| 0.94| 11.98| 11.98| 1543291200| 4.786| 47.3| 1543251600| 42.1| 1543298400| 43.92| 1543251600| 36.2| 1543291200| rain | 41.83| 1003.97| 90| 1.0| 0| 4.786|291.1| 1543232969|1543266992| 0.64| 0.13| 1543251600| 40.49| 1543233600| 47.3| 1543251600| 36.2| 1543291200| 43.92| 1543251600|\n", + "|981a3613-77af-462...|1.543366822198E9| 1| 28| 11|2018-11-28 01:00:22|America/New_York|Haymarket Square| North Station| Lyft| lyft| Lyft| 7| 0.44| 1.0| 42.2148| -71.033| 38.33| 32.93| Clear | Light rain in th...| 0.0| 0.0| 0.75| 7.33| 7.33| 1543334400| 10.0| 47.55| 1543320000| 33.1| 1543402800| 44.12| 1543320000| 29.11| 1543392000| clear-night | 31.1| 992.28| 240| 0.03| 0| 10.0|315.7| 1543319437|1543353364| 0.68| 0.1064| 1543338000| 35.36| 1543377600| 47.55| 1543320000| 31.04| 1543377600| 44.12| 1543320000|\n", + "|c2d88af2-d278-4bf...|1.543553582749E9| 4| 30| 11|2018-11-30 04:53:02|America/New_York|Haymarket Square| North Station| Lyft| lyft_luxsuv|Lux Black XL| 26| 0.44| 1.0| 42.2148| -71.033| 34.38| 29.63| Clear | Partly cloudy th...| 0.0| 0.0| 0.73| 5.28| 5.28| 1543514400| 10.0| 45.03| 1543510800| 28.9| 1543579200| 38.53| 1543510800| 26.2| 1543575600| clear-night | 26.64| 1013.73| 310| 0.0| 0| 10.0|291.1| 1543492370|1543526114| 0.75| 0.0| 1543507200| 34.67| 1543550400| 45.03| 1543510800| 30.3| 1543550400| 38.53| 1543510800|\n", + "|e0126e1f-8ca9-4f2...|1.543463360223E9| 3| 29| 11|2018-11-29 03:49:20|America/New_York|Haymarket Square| North Station| Lyft| lyft_plus| Lyft XL| 9| 0.44| 1.0| 42.2148| -71.033| 37.44| 30.88| Partly Cloudy | Mostly cloudy th...| 0.0| 0.0| 0.7| 9.14| 9.14| 1543446000| 10.0| 42.18| 1543420800| 36.71| 1543478400| 35.75| 1543420800| 30.29| 1543460400| partly-cloudy-ni...| 28.61| 998.36| 303| 0.44| 0| 
10.0|347.7| 1543405904|1543439738| 0.72| 1.0E-4| 1543420800| 33.1| 1543402800| 42.18| 1543420800| 29.11| 1543392000| 35.75| 1543420800|\n", + "|f6f6d7e4-3e18-492...|1.545071112138E9| 18| 17| 12|2018-12-17 18:25:12|America/New_York|Haymarket Square| North Station| Lyft| lyft_lux| Lux Black| 16.5| 0.44| 1.0| 42.2148| -71.033| 38.75| 33.51| Overcast | Light rain in th...| 0.0| 0.0| 0.84| 7.19| 8.88| 1545022800| 8.325| 40.61| 1545076800| 24.07| 1545130800| 34.97| 1545080400| 12.04| 1545134400| cloudy | 34.41| 1000.46| 294| 1.0| 1| 8.325|335.8| 1545048523|1545081282| 0.33| 0.0221| 1545066000| 34.19| 1545048000| 40.66| 1545022800| 27.39| 1545044400| 34.97| 1545080400|\n", + "|462816a3-820d-408...| 1.5432085802E9| 5| 26| 11|2018-11-26 05:03:00|America/New_York| Back Bay|Northeastern Univ...| Lyft| lyft_plus| Lyft XL| 10.5| 1.08| 1.0| 42.3503| -71.081| 41.99| 41.99| Overcast | Rain until morni...| 0.0| 0.0| 0.91| 0.53| 0.88| 1543287600| 4.675| 46.46| 1543255200| 42.17| 1543298400| 43.81| 1543251600| 37.08| 1543298400| cloudy | 39.54| 1014.11| 91| 1.0| 0| 4.675|312.3| 1543233004|1543266980| 0.64| 0.1245| 1543251600| 40.67| 1543233600| 46.46| 1543255200| 37.45| 1543291200| 43.81| 1543251600|\n", + "|474d6376-bc59-4ec...|1.543780384677E9| 19| 2| 12|2018-12-02 19:53:04|America/New_York| Back Bay|Northeastern Univ...| Lyft| lyft_lux| Lux Black| 16.5| 1.08| 1.0| 42.3503| -71.081| 49.88| 49.22| Light Rain | Light rain until...| 0.0246| 1.0| 0.93| 3.38| 3.38| 1543755600| 3.052| 50.8| 1543788000| 44.97| 1543816800| 50.13| 1543788000| 45.62| 1543816800| rain | 48.02| 1004.33| 159| 1.0| 0| 3.052|282.5| 1543751798|1543785242| 0.86| 0.0916| 1543770000| 36.32| 1543726800| 50.8| 1543788000| 35.84| 1543748400| 50.13| 1543788000|\n", + "|4f9fee41-fde3-476...|1.543818482645E9| 6| 3| 12|2018-12-03 06:28:02|America/New_York| Back Bay|Northeastern Univ...| Lyft| lyft_line| Shared| 3| 1.08| 1.0| 42.3503| -71.081| 45.58| 45.58| Foggy | Foggy in the mor...| 0.0| 0.0| 0.96| 1.25| 2.09| 1543856400| 
1.413| 57.02| 1543852800| 33.74| 1543921200| 56.35| 1543852800| 28.53| 1543914000| fog | 44.5| 1001.06| 307| 1.0| 0| 1.413|290.9| 1543838259|1543871628| 0.89| 4.0E-4| 1543852800| 43.09| 1543896000| 57.02| 1543852800| 39.9| 1543896000| 56.35| 1543852800|\n", + "|8612d909-98b8-445...|1.543315522249E9| 10| 27| 11|2018-11-27 10:45:22|America/New_York| Back Bay|Northeastern Univ...| Lyft| lyft_luxsuv|Lux Black XL| 27.5| 1.08| 1.0| 42.3503| -71.081| 45.45| 41.77| Light Rain | Light rain in th...| 0.0624| 1.0| 0.93| 6.87| 7.42| 1543338000| 2.686| 46.91| 1543320000| 33.82| 1543399200| 44.01| 1543320000| 30.19| 1543399200| rain | 43.52| 989.98| 79| 1.0| 0| 2.686|296.2| 1543319472|1543353352| 0.68| 0.1425| 1543338000| 36.34| 1543377600| 46.91| 1543320000| 32.43| 1543377600| 44.01| 1543320000|\n", + "|9043bf77-1d45-4a9...|1.543594383882E9| 16| 30| 11|2018-11-30 16:13:03|America/New_York| Back Bay|Northeastern Univ...| Lyft| lyft_premier| Lux| 13.5| 1.08| 1.0| 42.3503| -71.081| 40.13| 38.0| Clear | Mostly cloudy th...| 0.0| 0.0| 0.62| 3.46| 4.47| 1543554000| 9.92| 42.32| 1543600800| 31.57| 1543665600| 40.48| 1543611600| 28.1| 1543658400| clear-day | 27.99| 1016.84| 291| 0.12| 2| 9.92|269.9| 1543578871|1543612479| 0.79| 4.0E-4| 1543593600| 28.64| 1543579200| 42.32| 1543600800| 29.29| 1543579200| 40.48| 1543611600|\n", + "|d859ec69-b3ff-4af...|1.543432987778E9| 19| 28| 11|2018-11-28 19:23:07|America/New_York| Back Bay|Northeastern Univ...| Lyft| lyft| Lyft| 7| 1.08| 1.0| 42.3503| -71.081| 41.47| 35.66| Overcast | Mostly cloudy th...| 0.0| 0.0| 0.63| 9.54| 14.86| 1543431600| 10.0| 42.72| 1543438800| 37.59| 1543485600| 36.75| 1543438800| 32.25| 1543478400| cloudy | 29.72| 991.85| 295| 1.0| 0| 10.0|354.2| 1543405940|1543439725| 0.72| 0.0| 1543420800| 33.82| 1543399200| 42.72| 1543438800| 30.19| 1543399200| 36.75| 1543438800|\n", + "|009e9c53-074d-43c...|1.543615981179E9| 22| 30| 11|2018-11-30 22:13:01|America/New_York| North End| West End| Uber|6f72dfc5-27f1-42e...| UberXL| 12| 
1.11| 1.0| 42.3647| -71.0542| 40.13| 38.08| Overcast | Mostly cloudy th...| 0.0| 0.0| 0.6| 3.38| 3.99| 1543554000| 9.833| 42.52| 1543600800| 31.71| 1543658400| 40.53| 1543611600| 28.06| 1543658400| cloudy | 27.31| 1017.16| 281| 1.0| 0| 9.833|281.8| 1543578867|1543612470| 0.79| 3.0E-4| 1543593600| 28.79| 1543579200| 42.52| 1543600800| 26.41| 1543575600| 40.53| 1543611600|\n", + "|23f145da-f0c1-4d1...|1.544698211014E9| 10| 13| 12|2018-12-13 10:50:11|America/New_York| North End| West End| Uber|6c84fd89-3f11-478...| Black| 16| 1.11| 1.0| 42.3647| -71.0542| 20.38| 20.38| Clear | Partly cloudy th...| 0.0| 0.0| 0.66| 2.94| 3.22| 1544738400| 9.831| 33.83| 1544731200| 27.27| 1544781600| 32.85| 1544734800| 24.61| 1544785200| clear-night | 10.87| 1031.51| 2| 0.03| 0| 9.831|327.3| 1544702792|1544735599| 0.21| 1.0E-4| 1544716800| 18.29| 1544688000| 33.83| 1544731200| 13.79| 1544688000| 32.85| 1544734800|\n", + "|357559cb-8c58-427...|1.544728503935E9| 19| 13| 12|2018-12-13 19:15:03|America/New_York| North End| West End| Uber|55c66225-fbe7-4fd...| UberX| 7.5| 1.11| 1.0| 42.3647| -71.0542| 32.85| 32.85| Mostly Cloudy | Partly cloudy th...| 0.0| 0.0| 0.56| 2.65| 3.83| 1544738400| 9.959| 33.83| 1544731200| 27.27| 1544781600| 32.85| 1544734800| 24.61| 1544785200| partly-cloudy-day | 18.66| 1033.65| 76| 0.64| 0| 9.959|330.8| 1544702792|1544735599| 0.21| 1.0E-4| 1544716800| 18.29| 1544688000| 33.83| 1544731200| 13.79| 1544688000| 32.85| 1544734800|\n", + "|50ef1165-9d23-416...| 1.54500451143E9| 23| 16| 12|2018-12-16 23:55:11|America/New_York| North End| West End| Uber|9a0e7b09-b92b-4c4...| WAV| 7.5| 1.11| 1.0| 42.3647| -71.0542| 41.29| 36.01| Light Rain | Rain throughout ...| 0.0567| 0.94| 0.86| 8.3| 8.3| 1545015600| 4.054| 43.83| 1544990400| 34.25| 1545044400| 38.38| 1544986800| 28.3| 1545044400| rain | 37.56| 1012.72| 57| 1.0| 0| 4.054|325.3| 1544962119|1544994839| 0.3| 0.1252| 1544979600| 39.22| 1544954400| 43.83| 1544990400| 33.98| 1545019200| 38.38| 1544986800|\n", + 
"|91c4861c-1780-42b...|1.544748007961E9| 0| 14| 12|2018-12-14 00:40:07|America/New_York| North End| West End| Uber|6d318bcc-22a3-4af...| Black SUV| 26| 1.11| 1.0| 42.3647| -71.0542| 31.25| 31.25| Overcast | Partly cloudy th...| 0.0| 0.0| 0.64| 2.62| 3.54| 1544738400| 10.0| 33.83| 1544731200| 27.27| 1544781600| 32.85| 1544734800| 24.61| 1544785200| cloudy | 20.53| 1035.06| 173| 0.91| 0| 10.0|326.7| 1544702792|1544735599| 0.21| 1.0E-4| 1544716800| 18.29| 1544688000| 33.83| 1544731200| 13.79| 1544688000| 32.85| 1544734800|\n", + "|e219e545-a006-493...|1.543519080802E9| 19| 29| 11|2018-11-29 19:18:00|America/New_York| North End| West End| Uber|997acbb5-e102-41e...| UberPool| 5.5| 1.11| 1.0| 42.3647| -71.0542| 43.49| 37.19| Mostly Cloudy | Partly cloudy th...| 0.0| 0.0| 0.52| 12.13| 19.97| 1543514400| 9.796| 44.61| 1543510800| 28.79| 1543579200| 38.21| 1543510800| 26.41| 1543575600| partly-cloudy-day | 26.83| 1007.12| 313| 0.53| 0| 9.796|309.7| 1543492402|1543526092| 0.75| 0.0| 1543510800| 35.35| 1543550400| 44.61| 1543510800| 31.14| 1543550400| 38.21| 1543510800|\n", + "|fa5fb705-03a0-4eb...|1.543673584211E9| 14| 1| 12|2018-12-01 14:13:04|America/New_York| North End| West End| Uber|8cf7e821-f0d3-49c...| Taxi| NA| 1.11| 1.0| 42.3647| -71.0542| 36.99| 32.27| Partly Cloudy | Light rain in th...| 0.0| 0.0| 0.68| 5.87| 6.26| 1543672800| 9.91| 44.66| 1543690800| 35.04| 1543712400| 43.99| 1543690800| 35.69| 1543712400| partly-cloudy-day | 27.53| 1022.32| 344| 0.44| 1| 9.91|280.1| 1543665331|1543698851| 0.82| 0.0| 1543683600| 31.71| 1543658400| 44.66| 1543690800| 28.06| 1543658400| 43.99| 1543690800|\n", + "|18d580ac-c91a-4b6...|1.544940911553E9| 6| 16| 12|2018-12-16 06:15:11|America/New_York| North Station| Haymarket Square| Lyft| lyft_plus| Lyft XL| 11| 0.72| 1.0| 42.3661| -71.0631| 40.36| 35.52| Clear | Rain throughout ...| 0.0| 0.0| 0.69| 7.08| 8.47| 1545015600| 10.0| 43.78| 1544990400| 34.12| 1545044400| 38.39| 1544986800| 28.21| 1545044400| clear-night | 30.94| 1022.94| 
from pyspark.sql.functions import col, avg, count, year, datediff, expr, min, max, to_date, month, hour, cast, sum, rank, concat, lit, date_format
from pyspark.sql.window import Window
import pyspark.pandas as ps


# Provider-side analysis
print("[공급자 측면 조사]")

## Routes (source -> destination) ranked by average price, highest first
print("[평균 가격이 높았던 출발지와 도착지]")
result_df = (
    raw_df
    .withColumn('src_to_dest', concat(col('source'), lit('->'), col('destination')))
    .groupBy('src_to_dest')
    .agg(avg('price').alias('avg_price'))
    .orderBy('avg_price', ascending=False)
)

result_df.show()

### Result chart
(
    ps.DataFrame(result_df)
    .set_index('src_to_dest')
    .avg_price
    .plot
    .bar()
    .show()
)


## Revenue per month, compared between cab companies
print("[월(month) 별 택시 회사 매출 비교]")
# date_format yields a zero-padded 'yyyy-MM' key, so the string ordering below
# is chronological for every month, and the dataset's own `year`/`month`
# columns are left untouched.
result_df = (
    raw_df
    .withColumn('year_month', date_format(col('datetime'), 'yyyy-MM'))
    .withColumn('year_month_by_cab_type', concat(col('year_month'), lit('/'), col('cab_type')))
    .groupBy('year_month_by_cab_type')
    .agg(sum('price').alias('total_price'))
    .orderBy('year_month_by_cab_type')
)

result_df.show()

### Result chart
(
    ps.DataFrame(result_df)
    .set_index('year_month_by_cab_type')
    .plot
    .bar(y='total_price')
    .show()
)

## Pickup location with the most calls for every hour of every day
print("[매 시간(hour) 별 가장 콜이 많았던 장소]")
# Rank pickup locations inside each hour bucket by number of calls.
# rank() keeps ties, so an hour may list more than one location.
windowSpec = Window \
    .partitionBy('date_hour') \
    .orderBy(col('count').desc())

result_df = (
    raw_df
    # Zero-padded hour ('HH'): the key is a string, and an unpadded hour sorts
    # "…/3" after "…/23".
    .withColumn('date_hour', date_format(col('datetime'), 'yyyy-MM-dd/HH'))
    .groupBy('date_hour', 'source')
    .agg(count('source').alias('count'))
    .withColumn('rank', rank().over(windowSpec))
    # Filter on `rank` while the column still exists, then project.
    .filter(col('rank') == 1)
    # Keep `count`: the scatter plot below sizes its markers by it.
    .select('date_hour', 'source', 'count')
    .orderBy('date_hour')
)

result_df.show()

### Result chart
# `.show()` on the plot result already assumes the plotly backend (the
# pandas-on-Spark default); with plotly the marker-size keyword is `size`.
(
    ps.DataFrame(result_df)
    .plot
    .scatter(x='date_hour', y='source', size='count')
    .show()
)


### Manual verification (uncomment to inspect the per-hour counts directly)
# raw_df.withColumn('date', to_date(col('datetime'))) \
#     .withColumn('hour', hour(col('datetime'))) \
#     .groupBy('date', 'hour', 'source') \
#     .agg(count('source').alias('count')) \
#     .orderBy('date', 'hour', col('count').desc()) \
#     .show(40)
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[월(month) 별 택시 회사 매출 비교]\n", + "+----------------------+-----------+\n", + "|year_month_by_cab_type|total_price|\n", + "+----------------------+-----------+\n", + "| 2018-11/Lyft| 154411.5|\n", + "| 2018-11/Uber| 141712.5|\n", + "| 2018-12/Lyft| 204014.0|\n", + "| 2018-12/Uber| 193503.0|\n", + "+----------------------+-----------+\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[매 시간(hour) 별 가장 콜이 많았던 장소]\n", + "+-------------+--------------------+\n", + "| date_hour| source|\n", + "+-------------+--------------------+\n", + "|2018-11-26/10| Back Bay|\n", + "|2018-11-26/10| Boston University|\n", + "|2018-11-26/11|Northeastern Univ...|\n", + "|2018-11-26/12| Back Bay|\n", + "|2018-11-26/13| Theatre District|\n", + "|2018-11-26/14| North Station|\n", + "|2018-11-26/15| Haymarket Square|\n", + "|2018-11-26/16| North Station|\n", + "|2018-11-26/17| South Station|\n", + "|2018-11-26/17| Fenway|\n", + "|2018-11-26/18| Theatre District|\n", + "|2018-11-26/19| Financial District|\n", + "|2018-11-26/20|Northeastern Univ...|\n", + "|2018-11-26/21| Beacon Hill|\n", + "|2018-11-26/22|Northeastern Univ...|\n", + "|2018-11-26/23| North Station|\n", + "| 2018-11-26/3| Theatre District|\n", + "| 2018-11-26/3| Haymarket Square|\n", + "| 2018-11-26/3| North Station|\n", + "| 2018-11-26/3| South Station|\n", + "+-------------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
# `concat`, `lit` and `ps` are used below; import them here so the cell does not
# depend on names leaked from the previous cell.
from pyspark.sql.functions import col, avg, count, year, datediff, expr, min, max, to_date, month, hour, cast, sum, rank, concat, lit
from pyspark.sql.window import Window
import pyspark.pandas as ps

# Consumer-side analysis
print("[수요자 측면 조사]")

## Average price per route (source -> destination), split by cab company
print("[각 회사별 출발지, 목적지 평균 가격]")
result_df = (
    raw_df
    # Human-readable label "<cab_type>(<source>-><destination>)" used as the
    # chart index.
    .withColumn(
        'src_to_dest_by_cab_type',
        concat(col('cab_type'), lit('('), col('source'), lit('->'), col('destination'), lit(')')),
    )
    .groupBy('cab_type', 'source', 'destination', 'src_to_dest_by_cab_type')
    .agg(avg('price').alias('avg_price'))
    .orderBy('source', 'destination', 'cab_type')
)

result_df.show(20)

### Result chart
(
    ps.DataFrame(result_df)
    .set_index('src_to_dest_by_cab_type')
    .plot
    .bar(y='avg_price')
    .show()
)


## Cab company with the most rides for each pickup location
print("[각 출발지 별로 콜이 잘 잡히는 회사]")

# Rank companies inside each pickup location by ride count.
# rank() keeps ties, so a location may list more than one company.
windowSpec = Window \
    .partitionBy('source') \
    .orderBy(col('count').desc())

(
    raw_df
    .groupBy('source', 'cab_type')
    # GroupedData.count() already names its column 'count'; the former
    # `.alias('count')` aliased the DataFrame itself and had no effect.
    .count()
    .withColumn('rank', rank().over(windowSpec))
    .filter(col('rank') == 1)
    .select('source', 'cab_type', 'count')
    .orderBy('source')
    .show()
)

### Manual verification (uncomment to inspect all counts)
# raw_df \
#     .groupBy('source', 'cab_type') \
#     .count() \
#     .orderBy('source', 'cab_type', 'count') \
#     .show(50)

## Best value for money: average price per unit of distance, cheapest first
print("[가성비가 좋은 택시 회사]")
(
    raw_df
    .groupBy('cab_type')
    .agg(avg(col('price') / col('distance')).alias('avg_price_per_distance'))
    .orderBy('avg_price_per_distance')
    .show()
)
], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "RN-UVTb6dKRi", + "outputId": "9fbf1762-7e97-4958-f262-7d0bf2128a11" + }, + "execution_count": 34, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[수요자 측면 조사]\n", + "[각 회사별 출발지, 목적지 평균 가격]\n", + "+--------+-----------+--------------------+-----------------------+------------------+\n", + "|cab_type| source| destination|src_to_dest_by_cab_type| avg_price|\n", + "+--------+-----------+--------------------+-----------------------+------------------+\n", + "| Lyft| Back Bay| Boston University| Lyft(Back Bay->Bo...|14.235887096774194|\n", + "| Uber| Back Bay| Boston University| Uber(Back Bay->Bo...| 13.0688202247191|\n", + "| Lyft| Back Bay| Fenway| Lyft(Back Bay->Fe...| 14.97003745318352|\n", + "| Uber| Back Bay| Fenway| Uber(Back Bay->Fe...|13.309602649006623|\n", + "| Lyft| Back Bay| Haymarket Square| Lyft(Back Bay->Ha...|18.686974789915965|\n", + "| Uber| Back Bay| Haymarket Square| Uber(Back Bay->Ha...|16.791208791208792|\n", + "| Lyft| Back Bay| North End| Lyft(Back Bay->No...|21.535714285714285|\n", + "| Uber| Back Bay| North End| Uber(Back Bay->No...|18.203389830508474|\n", + "| Lyft| Back Bay|Northeastern Univ...| Lyft(Back Bay->No...|13.797297297297296|\n", + "| Uber| Back Bay|Northeastern Univ...| Uber(Back Bay->No...|12.613505747126437|\n", + "| Lyft| Back Bay| South Station| Lyft(Back Bay->So...|16.437037037037037|\n", + "| Uber| Back Bay| South Station| Uber(Back Bay->So...| 18.806640625|\n", + "| Lyft|Beacon Hill| Boston University| Lyft(Beacon Hill-...| 17.11919504643963|\n", + "| Uber|Beacon Hill| Boston University| Uber(Beacon Hill-...|15.539094650205762|\n", + "| Lyft|Beacon Hill| Fenway| Lyft(Beacon Hill-...|16.912225705329153|\n", + "| Uber|Beacon Hill| Fenway| Uber(Beacon Hill-...| 15.5893536121673|\n", + "| Lyft|Beacon Hill| Haymarket Square| Lyft(Beacon Hill-...|14.200607902735563|\n", + "| Uber|Beacon Hill| 
Haymarket Square| Uber(Beacon Hill-...| 13.57312925170068|\n", + "| Lyft|Beacon Hill| North End| Lyft(Beacon Hill-...|15.623475609756097|\n", + "| Uber|Beacon Hill| North End| Uber(Beacon Hill-...|15.053763440860216|\n", + "+--------+-----------+--------------------+-----------------------+------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[각 출발지 별로 콜이 잘 잡히는 회사]\n", + "+--------------------+--------+-----+\n", + "| source|cab_type|count|\n", + "+--------------------+--------+-----+\n", + "| Back Bay| Uber| 2135|\n", + "| Beacon Hill| Uber| 1905|\n", + "| Boston University| Uber| 2056|\n", + "| Fenway| Uber| 2112|\n", + "| Financial District| Uber| 2058|\n", + "| Haymarket Square| Uber| 2171|\n", + "| North End| Uber| 2061|\n", + "| North Station| Uber| 1990|\n", + "|Northeastern Univ...| Uber| 2139|\n", + "| South Station| Uber| 1871|\n", + "| Theatre District| Uber| 2252|\n", + "| West End| Uber| 2076|\n", + "+--------------------+--------+-----+\n", + "\n", + "[가성비가 좋은 택시 회사]\n", + "+--------+----------------------+\n", + "|cab_type|avg_price_per_distance|\n", + "+--------+----------------------+\n", + "| Uber| 9.682674403228246|\n", + "| Lyft| 9.71325494447119|\n", + "+--------+----------------------+\n", + "\n" + ] + } + ] + } + ] +} \ No newline at end of file