From 1335010b52f6147b3810aa8ab2ceb90559cbb963 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 25 Jul 2018 11:21:10 +0200 Subject: [PATCH] update to DataFrames 0.11.7 --- 01_constructors.ipynb | 184 ++++++++++++++++++++++----------------- 02_basicinfo.ipynb | 171 +++++++++++++++---------------------- 03_missingvalues.ipynb | 64 ++++++++------ 04_loadsave.ipynb | 31 +++---- 05_columns.ipynb | 190 +++++++++++++++++++++++------------------ 08_joins.ipynb | 58 +++++++++---- 10_transforms.ipynb | 2 +- README.md | 13 ++- 8 files changed, 378 insertions(+), 335 deletions(-) diff --git a/01_constructors.ipynb b/01_constructors.ipynb index baecd5a..70a5726 100644 --- a/01_constructors.ipynb +++ b/01_constructors.ipynb @@ -5,7 +5,7 @@ "metadata": {}, "source": [ "# Introduction to DataFrames\n", - "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018**\n", + "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), July 25, 2018**\n", "\n", "Let's get started by loading the `DataFrames` package." ] @@ -13,9 +13,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "using DataFrames" @@ -77,15 +75,15 @@ { "data": { "text/html": [ - "
ABC
110.865057Jds
220.870442hDG
330.128666oyt
" + "
ABC
110.582939eDh
220.899657Uma
330.873748RnO
" ], "text/plain": [ "3×3 DataFrames.DataFrame\n", "│ Row │ A │ B │ C │\n", "├─────┼───┼──────────┼─────┤\n", - "│ 1 │ 1 │ 0.865057 │ Jds │\n", - "│ 2 │ 2 │ 0.870442 │ hDG │\n", - "│ 3 │ 3 │ 0.128666 │ oyt │" + "│ 1 │ 1 │ 0.582939 │ eDh │\n", + "│ 2 │ 2 │ 0.899657 │ Uma │\n", + "│ 3 │ 3 │ 0.873748 │ RnO │" ] }, "execution_count": 3, @@ -183,15 +181,15 @@ { "data": { "text/html": [ - "
x1x2x3
10.7389690.4763960.926968
20.4985590.1900630.839678
30.09577120.8431560.120698
" + "
x1x2x3
10.2914770.5899150.321527
20.2306790.03877160.539359
30.4815370.4948670.649062
" ], "text/plain": [ "3×3 DataFrames.DataFrame\n", - "│ Row │ x1 │ x2 │ x3 │\n", - "├─────┼───────────┼──────────┼──────────┤\n", - "│ 1 │ 0.738969 │ 0.476396 │ 0.926968 │\n", - "│ 2 │ 0.498559 │ 0.190063 │ 0.839678 │\n", - "│ 3 │ 0.0957712 │ 0.843156 │ 0.120698 │" + "│ Row │ x1 │ x2 │ x3 │\n", + "├─────┼──────────┼───────────┼──────────┤\n", + "│ 1 │ 0.291477 │ 0.589915 │ 0.321527 │\n", + "│ 2 │ 0.230679 │ 0.0387716 │ 0.539359 │\n", + "│ 3 │ 0.481537 │ 0.494867 │ 0.649062 │" ] }, "execution_count": 6, @@ -222,11 +220,11 @@ "\u001b[1m\u001b[33mWARNING: \u001b[39m\u001b[22m\u001b[33mpassing columns argument with non-AbstractVector entries is deprecated\u001b[39m\n", "Stacktrace:\n", " [1] \u001b[1mdepwarn\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::String, ::Symbol\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\deprecated.jl:70\u001b[22m\u001b[22m\n", - " [2] \u001b[1m#DataFrame#57\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Bool, ::Type{T} where T, ::Array{Float64,1}, ::Array{Symbol,1}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe\\dataframe.jl:154\u001b[22m\u001b[22m\n", + " [2] \u001b[1m#DataFrame#62\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Bool, ::Type{T} where T, ::Array{Float64,1}, ::Array{Symbol,1}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe\\dataframe.jl:154\u001b[22m\u001b[22m\n", " [3] \u001b[1mDataFrames.DataFrame\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Array{Float64,1}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe\\dataframe.jl:152\u001b[22m\u001b[22m\n", " [4] \u001b[1minclude_string\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::String, ::String\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\loading.jl:522\u001b[22m\u001b[22m\n", " [5] \u001b[1mexecute_request\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::ZMQ.Socket, 
::IJulia.Msg\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\IJulia\\src\\execute_request.jl:158\u001b[22m\u001b[22m\n", - " [6] \u001b[1m(::Compat.#inner#17{Array{Any,1},IJulia.#execute_request,Tuple{ZMQ.Socket,IJulia.Msg}})\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\Compat\\src\\Compat.jl:385\u001b[22m\u001b[22m\n", + " [6] \u001b[1m(::Compat.#inner#14{Array{Any,1},IJulia.#execute_request,Tuple{ZMQ.Socket,IJulia.Msg}})\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\Compat\\src\\Compat.jl:332\u001b[22m\u001b[22m\n", " [7] \u001b[1meventloop\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::ZMQ.Socket\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\IJulia\\src\\eventloop.jl:8\u001b[22m\u001b[22m\n", " [8] \u001b[1m(::IJulia.##14#17)\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\task.jl:335\u001b[22m\u001b[22m\n", "while loading In[7], in expression starting on line 1\n" @@ -235,13 +233,13 @@ { "data": { "text/html": [ - "
x1x2x3
10.6136730.93880.976714
" + "
x1x2x3
10.6946140.3051830.375302
" ], "text/plain": [ "1×3 DataFrames.DataFrame\n", - "│ Row │ x1 │ x2 │ x3 │\n", - "├─────┼──────────┼────────┼──────────┤\n", - "│ 1 │ 0.613673 │ 0.9388 │ 0.976714 │" + "│ Row │ x1 │ x2 │ x3 │\n", + "├─────┼──────────┼──────────┼──────────┤\n", + "│ 1 │ 0.694614 │ 0.305183 │ 0.375302 │" ] }, "execution_count": 7, @@ -283,7 +281,7 @@ } ], "source": [ - "DataFrame(transpose([1, 2, 3]))" + "DataFrame(transpose([1, 2, 3])) # permutedims in Julia 0.7" ] }, { @@ -336,15 +334,15 @@ { "data": { "text/html": [ - "
x1x2x3x4
10.8745570.7772460.9494670.697868
20.5791640.8160290.1914660.0563065
30.2807770.7957160.2013090.191633
" + "
x1x2x3x4
10.9577680.2220080.5692150.553819
20.08859230.498240.612310.985774
30.1520640.8045750.8259470.653275
" ], "text/plain": [ "3×4 DataFrames.DataFrame\n", - "│ Row │ x1 │ x2 │ x3 │ x4 │\n", - "├─────┼──────────┼──────────┼──────────┼───────────┤\n", - "│ 1 │ 0.874557 │ 0.777246 │ 0.949467 │ 0.697868 │\n", - "│ 2 │ 0.579164 │ 0.816029 │ 0.191466 │ 0.0563065 │\n", - "│ 3 │ 0.280777 │ 0.795716 │ 0.201309 │ 0.191633 │" + "│ Row │ x1 │ x2 │ x3 │ x4 │\n", + "├─────┼───────────┼──────────┼──────────┼──────────┤\n", + "│ 1 │ 0.957768 │ 0.222008 │ 0.569215 │ 0.553819 │\n", + "│ 2 │ 0.0885923 │ 0.49824 │ 0.61231 │ 0.985774 │\n", + "│ 3 │ 0.152064 │ 0.804575 │ 0.825947 │ 0.653275 │" ] }, "execution_count": 10, @@ -371,15 +369,15 @@ { "data": { "text/html": [ - "
abcd
10.6273360.8503840.2251640.617465
20.6450450.7095810.07804680.0941601
30.485630.6083780.7772130.630866
" + "
abcd
10.1887870.9039360.5007360.811463
20.3413020.5570990.419540.382749
30.07513780.4301650.2935010.253147
" ], "text/plain": [ "3×4 DataFrames.DataFrame\n", - "│ Row │ a │ b │ c │ d │\n", - "├─────┼──────────┼──────────┼───────────┼───────────┤\n", - "│ 1 │ 0.627336 │ 0.850384 │ 0.225164 │ 0.617465 │\n", - "│ 2 │ 0.645045 │ 0.709581 │ 0.0780468 │ 0.0941601 │\n", - "│ 3 │ 0.48563 │ 0.608378 │ 0.777213 │ 0.630866 │" + "│ Row │ a │ b │ c │ d │\n", + "├─────┼───────────┼──────────┼──────────┼──────────┤\n", + "│ 1 │ 0.188787 │ 0.903936 │ 0.500736 │ 0.811463 │\n", + "│ 2 │ 0.341302 │ 0.557099 │ 0.41954 │ 0.382749 │\n", + "│ 3 │ 0.0751378 │ 0.430165 │ 0.293501 │ 0.253147 │" ] }, "execution_count": 11, @@ -408,13 +406,13 @@ { "data": { "text/html": [ - "
ABC
11472598881.08013e-319missing
" + "
ABC
100.0missing
" ], "text/plain": [ "1×3 DataFrames.DataFrame\n", - "│ Row │ A │ B │ C │\n", - "├─────┼───────────┼──────────────┼─────────┤\n", - "│ 1 │ 147259888 │ 1.08013e-319 │ \u001b[90mmissing\u001b[39m │" + "│ Row │ A │ B │ C │\n", + "├─────┼───┼─────┼─────────┤\n", + "│ 1 │ 0 │ 0.0 │ \u001b[90mmissing\u001b[39m │" ] }, "execution_count": 12, @@ -430,9 +428,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Here we create a `DataFrame`, but column `:C` is #undef and Jupyter has problem with displaying it. (This works OK at the REPL.)\n", - "\n", - "This will be fixed in next release of DataFrames!" + "Here we create a `DataFrame` where `:C` is #undef" ] }, { @@ -441,24 +437,20 @@ "metadata": {}, "outputs": [ { - "ename": "UndefRefError", - "evalue": "\u001b[91mUndefRefError: access to undefined reference\u001b[39m", - "output_type": "error", - "traceback": [ - "\u001b[91mUndefRefError: access to undefined reference\u001b[39m", - "", - "Stacktrace:", - " [1] \u001b[1mgetindex\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Array{String,1}, ::Int64\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\array.jl:554\u001b[22m\u001b[22m", - " [2] \u001b[1mgetindex\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::DataFrames.DataFrame, ::Int64, ::Symbol\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe\\dataframe.jl:275\u001b[22m\u001b[22m", - " [3] \u001b[1mshow\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::IOContext{Base.AbstractIOBuffer{Array{UInt8,1}}}, ::MIME{Symbol(\"text/html\")}, ::DataFrames.DataFrame\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\abstractdataframe\\io.jl:110\u001b[22m\u001b[22m", - " [4] \u001b[1mlimitstringmime\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::MIME{Symbol(\"text/html\")}, ::DataFrames.DataFrame\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\IJulia\\src\\inline.jl:24\u001b[22m\u001b[22m", - " [5] 
\u001b[1mdisplay_dict\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::DataFrames.DataFrame\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\IJulia\\src\\execute_request.jl:43\u001b[22m\u001b[22m", - " [6] \u001b[1m(::Compat.#inner#17{Array{Any,1},IJulia.#display_dict,Tuple{DataFrames.DataFrame}})\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\Compat\\src\\Compat.jl:385\u001b[22m\u001b[22m", - " [7] \u001b[1mexecute_request\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::ZMQ.Socket, ::IJulia.Msg\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\IJulia\\src\\execute_request.jl:186\u001b[22m\u001b[22m", - " [8] \u001b[1m(::Compat.#inner#17{Array{Any,1},IJulia.#execute_request,Tuple{ZMQ.Socket,IJulia.Msg}})\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\Compat\\src\\Compat.jl:385\u001b[22m\u001b[22m", - " [9] \u001b[1meventloop\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::ZMQ.Socket\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\IJulia\\src\\eventloop.jl:8\u001b[22m\u001b[22m", - " [10] \u001b[1m(::IJulia.##14#17)\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\task.jl:335\u001b[22m\u001b[22m" - ] + "data": { + "text/html": [ + "
ABC
1-12.03553e-315#undef
" + ], + "text/plain": [ + "1×3 DataFrames.DataFrame\n", + "│ Row │ A │ B │ C │\n", + "├─────┼────┼──────────────┼────────┤\n", + "│ 1 │ -1 │ 2.03553e-315 │ #undef │" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -510,15 +502,15 @@ { "data": { "text/html": [ - "
x1x2x3x4x5
120256884819986612816492674416640439701264
214729236814729236800148642768
314729307214729243200439701424
" + "
x1x2x3x4x5
1166116144165712048166110096169519056165712048
2164408368167317968111313296111313680164408368
3111313296141233040141233200141238528141233232
" ], "text/plain": [ "3×5 DataFrames.DataFrame\n", - "│ Row │ x1 │ x2 │ x3 │ x4 │ x5 │\n", - "├─────┼───────────┼───────────┼───────────────┼────┼───────────┤\n", - "│ 1 │ 202568848 │ 199866128 │ 1649267441664 │ 0 │ 439701264 │\n", - "│ 2 │ 147292368 │ 147292368 │ 0 │ 0 │ 148642768 │\n", - "│ 3 │ 147293072 │ 147292432 │ 0 │ 0 │ 439701424 │" + "│ Row │ x1 │ x2 │ x3 │ x4 │ x5 │\n", + "├─────┼───────────┼───────────┼───────────┼───────────┼───────────┤\n", + "│ 1 │ 166116144 │ 165712048 │ 166110096 │ 169519056 │ 165712048 │\n", + "│ 2 │ 164408368 │ 167317968 │ 111313296 │ 111313680 │ 164408368 │\n", + "│ 3 │ 111313296 │ 141233040 │ 141233200 │ 141238528 │ 141233232 │" ] }, "execution_count": 15, @@ -545,16 +537,16 @@ { "data": { "text/html": [ - "
x1x2
11486446882.17246e-315
21472613282.17242e-315
31472613282.17246e-315
402.20078e-315
" + "
x1x2
11666925288.26715e-316
21412347682.11841e-315
31114112806.97793e-316
41441179200.0
" ], "text/plain": [ "4×2 DataFrames.DataFrame\n", "│ Row │ x1 │ x2 │\n", "├─────┼───────────┼──────────────┤\n", - "│ 1 │ 148644688 │ 2.17246e-315 │\n", - "│ 2 │ 147261328 │ 2.17242e-315 │\n", - "│ 3 │ 147261328 │ 2.17246e-315 │\n", - "│ 4 │ 0 │ 2.20078e-315 │" + "│ 1 │ 166692528 │ 8.26715e-316 │\n", + "│ 2 │ 141234768 │ 2.11841e-315 │\n", + "│ 3 │ 111411280 │ 6.97793e-316 │\n", + "│ 4 │ 144117920 │ 0.0 │" ] }, "execution_count": 16, @@ -850,7 +842,7 @@ "\u001b[91mcannot convert a DataFrame containing missing values to array (found for column y)\u001b[39m", "", "Stacktrace:", - " [1] \u001b[1mconvert\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Type{Array{Int64,2}}, ::DataFrames.DataFrame\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\abstractdataframe\\abstractdataframe.jl:626\u001b[22m\u001b[22m", + " [1] \u001b[1mconvert\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Type{Array{Int64,2}}, ::DataFrames.DataFrame\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\abstractdataframe\\abstractdataframe.jl:716\u001b[22m\u001b[22m", " [2] \u001b[1mArray{Int64,2}\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::DataFrames.DataFrame\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\sysimg.jl:77\u001b[22m\u001b[22m" ] } @@ -913,15 +905,15 @@ "\u001b[1m\u001b[33mWARNING: \u001b[39m\u001b[22m\u001b[33mDuplicate variable names are deprecated: pass makeunique=true to add a suffix automatically.\u001b[39m\n", "Stacktrace:\n", " [1] \u001b[1mdepwarn\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::String, ::Symbol\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\deprecated.jl:70\u001b[22m\u001b[22m\n", - " [2] \u001b[1m#make_unique#3\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Bool, ::Function, ::Array{Symbol,1}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\other\\utils.jl:61\u001b[22m\u001b[22m\n", + " [2] 
\u001b[1m#make_unique#3\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Bool, ::Function, ::Array{Symbol,1}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\other\\utils.jl:64\u001b[22m\u001b[22m\n", " [3] \u001b[1m(::DataFrames.#kw##make_unique)\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Array{Any,1}, ::DataFrames.#make_unique, ::Array{Symbol,1}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\:0\u001b[22m\u001b[22m\n", " [4] \u001b[1m#Index#6\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\other\\index.jl:12\u001b[22m\u001b[22m [inlined]\n", " [5] \u001b[1m(::Core.#kw#Type)\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Array{Any,1}, ::Type{DataFrames.Index}, ::Array{Symbol,1}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\:0\u001b[22m\u001b[22m\n", - " [6] \u001b[1m#DataFrame#47\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Bool, ::Type{T} where T, ::Pair{Symbol,Int64}, ::Vararg{Pair{Symbol,Int64},N} where N\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe\\dataframe.jl:126\u001b[22m\u001b[22m\n", + " [6] \u001b[1m#DataFrame#52\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Bool, ::Type{T} where T, ::Pair{Symbol,Int64}, ::Vararg{Pair{Symbol,Int64},N} where N\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe\\dataframe.jl:126\u001b[22m\u001b[22m\n", " [7] \u001b[1mDataFrames.DataFrame\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Pair{Symbol,Int64}, ::Pair{Symbol,Int64}, ::Pair{Symbol,Int64}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe\\dataframe.jl:124\u001b[22m\u001b[22m\n", " [8] \u001b[1minclude_string\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::String, ::String\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\loading.jl:522\u001b[22m\u001b[22m\n", " [9] 
\u001b[1mexecute_request\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::ZMQ.Socket, ::IJulia.Msg\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\IJulia\\src\\execute_request.jl:158\u001b[22m\u001b[22m\n", - " [10] \u001b[1m(::Compat.#inner#17{Array{Any,1},IJulia.#execute_request,Tuple{ZMQ.Socket,IJulia.Msg}})\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\Compat\\src\\Compat.jl:385\u001b[22m\u001b[22m\n", + " [10] \u001b[1m(::Compat.#inner#14{Array{Any,1},IJulia.#execute_request,Tuple{ZMQ.Socket,IJulia.Msg}})\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\Compat\\src\\Compat.jl:332\u001b[22m\u001b[22m\n", " [11] \u001b[1meventloop\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::ZMQ.Socket\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\IJulia\\src\\eventloop.jl:8\u001b[22m\u001b[22m\n", " [12] \u001b[1m(::IJulia.##14#17)\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\task.jl:335\u001b[22m\u001b[22m\n", "while loading In[28], in expression starting on line 1\n" @@ -968,17 +960,17 @@ "\u001b[1m\u001b[33mWARNING: \u001b[39m\u001b[22m\u001b[33mDuplicate variable names are deprecated: pass makeunique=true to add a suffix automatically.\u001b[39m\n", "Stacktrace:\n", " [1] \u001b[1mdepwarn\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::String, ::Symbol\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\deprecated.jl:70\u001b[22m\u001b[22m\n", - " [2] \u001b[1m#make_unique#3\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Bool, ::Function, ::Array{Symbol,1}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\other\\utils.jl:61\u001b[22m\u001b[22m\n", + " [2] \u001b[1m#make_unique#3\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Bool, ::Function, 
::Array{Symbol,1}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\other\\utils.jl:64\u001b[22m\u001b[22m\n", " [3] \u001b[1m(::DataFrames.#kw##make_unique)\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Array{Any,1}, ::DataFrames.#make_unique, ::Array{Symbol,1}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\:0\u001b[22m\u001b[22m\n", " [4] \u001b[1m#Index#6\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\other\\index.jl:12\u001b[22m\u001b[22m [inlined]\n", " [5] \u001b[1m(::Core.#kw#Type)\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Array{Any,1}, ::Type{DataFrames.Index}, ::Array{Symbol,1}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\:0\u001b[22m\u001b[22m\n", - " [6] \u001b[1m#DataFrame#47\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Bool, ::Type{T} where T, ::Pair{Symbol,Int64}, ::Vararg{Pair{Symbol,#s8} where #s8,N} where N\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe\\dataframe.jl:126\u001b[22m\u001b[22m\n", + " [6] \u001b[1m#DataFrame#52\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Bool, ::Type{T} where T, ::Pair{Symbol,Int64}, ::Vararg{Pair{Symbol,#s8} where #s8,N} where N\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe\\dataframe.jl:126\u001b[22m\u001b[22m\n", " [7] \u001b[1mDataFrames.DataFrame\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Pair{Symbol,Int64}, ::Pair{Symbol,Int64}, ::Pair{Symbol,Bool}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe\\dataframe.jl:124\u001b[22m\u001b[22m\n", - " [8] \u001b[1m#DataFrame#56\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe\\dataframe.jl:145\u001b[22m\u001b[22m [inlined]\n", + " [8] \u001b[1m#DataFrame#61\u001b[22m\u001b[22m at 
\u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe\\dataframe.jl:145\u001b[22m\u001b[22m [inlined]\n", " [9] \u001b[1m(::Core.#kw#Type)\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Array{Any,1}, ::Type{DataFrames.DataFrame}\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\:0\u001b[22m\u001b[22m\n", " [10] \u001b[1minclude_string\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::String, ::String\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\loading.jl:522\u001b[22m\u001b[22m\n", " [11] \u001b[1mexecute_request\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::ZMQ.Socket, ::IJulia.Msg\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\IJulia\\src\\execute_request.jl:158\u001b[22m\u001b[22m\n", - " [12] \u001b[1m(::Compat.#inner#17{Array{Any,1},IJulia.#execute_request,Tuple{ZMQ.Socket,IJulia.Msg}})\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\Compat\\src\\Compat.jl:385\u001b[22m\u001b[22m\n", + " [12] \u001b[1m(::Compat.#inner#14{Array{Any,1},IJulia.#execute_request,Tuple{ZMQ.Socket,IJulia.Msg}})\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\Compat\\src\\Compat.jl:332\u001b[22m\u001b[22m\n", " [13] \u001b[1meventloop\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::ZMQ.Socket\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\IJulia\\src\\eventloop.jl:8\u001b[22m\u001b[22m\n", " [14] \u001b[1m(::IJulia.##14#17)\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\task.jl:335\u001b[22m\u001b[22m\n", "while loading In[29], in expression starting on line 1\n" @@ -1004,11 +996,45 @@ "source": [ "df = DataFrame(a=1, a=2, makeunique=true)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally observe that `nothing` is not printed when displaying a `DataFrame`:" + ] + }, + { + "cell_type":
"code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
xy
11
2a
" + ], + "text/plain": [ + "2×2 DataFrames.DataFrame\n", + "│ Row │ x │ y │\n", + "├─────┼───┼───┤\n", + "│ 1 │ 1 │ │\n", + "│ 2 │ │ a │" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DataFrame(x=[1, nothing], y=[nothing, \"a\"])" + ] } ], "metadata": { "kernelspec": { - "display_name": "Julia 0.6.0", + "display_name": "Julia 0.6.2", "language": "julia", "name": "julia-0.6" }, @@ -1016,7 +1042,7 @@ "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", - "version": "0.6.0" + "version": "0.6.3" } }, "nbformat": 4, diff --git a/02_basicinfo.ipynb b/02_basicinfo.ipynb index d4107a5..a34cece 100644 --- a/02_basicinfo.ipynb +++ b/02_basicinfo.ipynb @@ -5,15 +5,13 @@ "metadata": {}, "source": [ "# Introduction to DataFrames\n", - "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018**" + "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), July 25, 2018**" ] }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "using DataFrames # load package" @@ -122,75 +120,28 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "A\n", - "Summary Stats:\n", - "Mean: 1.500000\n", - "Minimum: 1.000000\n", - "1st Quartile: 1.250000\n", - "Median: 1.500000\n", - "3rd Quartile: 1.750000\n", - "Maximum: 2.000000\n", - "Length: 2\n", - "Type: Int64\n", - "\n", - "B\n", - "Summary Stats:\n", - "Mean: 1.000000\n", - "Minimum: 1.000000\n", - "1st Quartile: 1.000000\n", - "Median: 1.000000\n", - "3rd Quartile: 1.000000\n", - "Maximum: 1.000000\n", - "Length: 2\n", - "Type: Union{Float64, Missings.Missing}\n", - "Number Missing: 1\n", - "% Missing: 50.000000\n", - "\n", - "C\n", - "Summary Stats:\n", - "Length: 2\n", - "Type: String\n", - "Number Unique: 2\n", - "\n" - ] + "data": { + "text/html": [ + "
variablemeanminmedianmaxnuniquenmissingeltype
1A1.511.52Int64
2B1.01.01.01.01Float64
3Cab2String
" + ], + "text/plain": [ + "3×8 DataFrames.DataFrame\n", + "│ Row │ variable │ mean │ min │ median │ max │ nunique │ nmissing │ eltype │\n", + "├─────┼──────────┼──────┼─────┼────────┼─────┼─────────┼──────────┼─────────┤\n", + "│ 1 │ A │ 1.5 │ 1 │ 1.5 │ 2 │ │ │ Int64 │\n", + "│ 2 │ B │ 1.0 │ 1.0 │ 1.0 │ 1.0 │ │ 1 │ Float64 │\n", + "│ 3 │ C │ │ a │ │ b │ 2 │ │ String │" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "describe(x)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use `showcols` to get informaton about columns stored in a DataFrame." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2×3 DataFrames.DataFrame\n", - "│ Col # │ Name │ Eltype │ Missing │ Values │\n", - "├───────┼──────┼──────────────────────────────────┼─────────┼─────────────────┤\n", - "│ 1 │ A │ Int64 │ 0 │ 1 … 2 │\n", - "│ 2 │ B │ Union{Float64, Missings.Missing} │ 1 │ 1.0 … missing │\n", - "│ 3 │ C │ String │ 0 │ a … b │" - ] - } - ], - "source": [ - "showcols(x)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -200,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -212,7 +163,7 @@ " :C" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -221,6 +172,13 @@ "names(x)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Future tip: In Julia 0.7 `propertynames` is also supported." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -230,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -242,7 +200,7 @@ " String " ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -260,10 +218,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": true - }, + "execution_count": 8, + "metadata": {}, "outputs": [], "source": [ "y = DataFrame(rand(1:10, 1000, 10));" @@ -278,27 +234,27 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
x1x2x3x4x5x6x7x8x9x10
1861271051510
2896610493109
3514310511059
42922577995
5484108512110
68668333686
" + "
x1x2x3x4x5x6x7x8x9x10
141281075183
2566324910104
33548444669
488267364106
512610477754
657910516234
" ], "text/plain": [ "6×10 DataFrames.DataFrame\n", "│ Row │ x1 │ x2 │ x3 │ x4 │ x5 │ x6 │ x7 │ x8 │ x9 │ x10 │\n", "├─────┼────┼────┼────┼────┼────┼────┼────┼────┼────┼─────┤\n", - "│ 1 │ 8 │ 6 │ 1 │ 2 │ 7 │ 10 │ 5 │ 1 │ 5 │ 10 │\n", - "│ 2 │ 8 │ 9 │ 6 │ 6 │ 10 │ 4 │ 9 │ 3 │ 10 │ 9 │\n", - "│ 3 │ 5 │ 1 │ 4 │ 3 │ 10 │ 5 │ 1 │ 10 │ 5 │ 9 │\n", - "│ 4 │ 2 │ 9 │ 2 │ 2 │ 5 │ 7 │ 7 │ 9 │ 9 │ 5 │\n", - "│ 5 │ 4 │ 8 │ 4 │ 10 │ 8 │ 5 │ 1 │ 2 │ 1 │ 10 │\n", - "│ 6 │ 8 │ 6 │ 6 │ 8 │ 3 │ 3 │ 3 │ 6 │ 8 │ 6 │" + "│ 1 │ 4 │ 1 │ 2 │ 8 │ 10 │ 7 │ 5 │ 1 │ 8 │ 3 │\n", + "│ 2 │ 5 │ 6 │ 6 │ 3 │ 2 │ 4 │ 9 │ 10 │ 10 │ 4 │\n", + "│ 3 │ 3 │ 5 │ 4 │ 8 │ 4 │ 4 │ 4 │ 6 │ 6 │ 9 │\n", + "│ 4 │ 8 │ 8 │ 2 │ 6 │ 7 │ 3 │ 6 │ 4 │ 10 │ 6 │\n", + "│ 5 │ 1 │ 2 │ 6 │ 10 │ 4 │ 7 │ 7 │ 7 │ 5 │ 4 │\n", + "│ 6 │ 5 │ 7 │ 9 │ 10 │ 5 │ 1 │ 6 │ 2 │ 3 │ 4 │" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -316,24 +272,24 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
x1x2x3x4x5x6x7x8x9x10
111057861236
21127973333
34612114794
" + "
x1x2x3x4x5x6x7x8x9x10
12837643854
27155361851
3811094210266
" ], "text/plain": [ "3×10 DataFrames.DataFrame\n", "│ Row │ x1 │ x2 │ x3 │ x4 │ x5 │ x6 │ x7 │ x8 │ x9 │ x10 │\n", "├─────┼────┼────┼────┼────┼────┼────┼────┼────┼────┼─────┤\n", - "│ 1 │ 1 │ 10 │ 5 │ 7 │ 8 │ 6 │ 1 │ 2 │ 3 │ 6 │\n", - "│ 2 │ 1 │ 1 │ 2 │ 7 │ 9 │ 7 │ 3 │ 3 │ 3 │ 3 │\n", - "│ 3 │ 4 │ 6 │ 1 │ 2 │ 1 │ 1 │ 4 │ 7 │ 9 │ 4 │" + "│ 1 │ 2 │ 8 │ 3 │ 7 │ 6 │ 4 │ 3 │ 8 │ 5 │ 4 │\n", + "│ 2 │ 7 │ 1 │ 5 │ 5 │ 3 │ 6 │ 1 │ 8 │ 5 │ 1 │\n", + "│ 3 │ 8 │ 1 │ 10 │ 9 │ 4 │ 2 │ 10 │ 2 │ 6 │ 6 │" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -353,7 +309,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -362,7 +318,7 @@ "([1, 2], [1, 2], [1, 2])" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -371,6 +327,13 @@ "x[1], x[:A], x[:, 1]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Future tip: In Julia 0.7 also accessing column using `x.A` syntax (`getproperty`/`setproperty!`) is supported." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -380,7 +343,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -395,7 +358,7 @@ "│ 1 │ 1 │ 1.0 │ a │" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -413,7 +376,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -422,7 +385,7 @@ "1" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -440,7 +403,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -456,7 +419,7 @@ "│ 2 │ 1 │ 1.0 │ b │" ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -475,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -491,7 +454,7 @@ "│ 2 │ 2 │ 2.0 │ b │" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -510,7 +473,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -526,7 +489,7 @@ "│ 2 │ 7 │ 8.0 │ b │" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -539,7 +502,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Julia 0.6.0", + "display_name": "Julia 0.6.2", "language": "julia", "name": "julia-0.6" }, @@ -547,7 +510,7 @@ "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", - "version": "0.6.0" + "version": "0.6.3" } }, "nbformat": 4, diff --git a/03_missingvalues.ipynb b/03_missingvalues.ipynb index 44abc5b..c580a08 100644 --- a/03_missingvalues.ipynb +++ b/03_missingvalues.ipynb @@ -5,7 +5,7 @@ "metadata": {}, "source": [ "# Introduction to DataFrames\n", - "**[Bogumił 
Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018**" + "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), July 25, 2018**" ] }, { @@ -391,7 +391,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Caution: `nothing` would also be replaced here (for Julia 0.7 a more sophisticated behavior of `coalesce` that allows to avoid this problem is planned)." + "Future tip: `nothing` would also be replaced here (for Julia 0.7 `coalesce` will only handle `missing`)." ] }, { @@ -641,7 +641,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When we call `showcols` on a `DataFrame` with dropped missing values, the columns still allow missing values." + "When we call `describe` on a `DataFrame` with dropped missing values, the columns still allow missing values." ] }, { @@ -650,48 +650,60 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "2×2 DataFrames.DataFrame\n", - "│ Col # │ Name │ Eltype │ Missing │ Values │\n", - "├───────┼──────┼─────────────────────────────────┼─────────┼─────────┤\n", - "│ 1 │ A │ Union{Int64, Missings.Missing} │ 0 │ 1 … 4 │\n", - "│ 2 │ B │ Union{Missings.Missing, String} │ 0 │ A … C │" - ] + "data": { + "text/html": [ + "
variablemeanminmedianmaxnuniquenmissingeltype
1A2.512.540Int64
2BAC20String
" + ], + "text/plain": [ + "2×8 DataFrames.DataFrame\n", + "│ Row │ variable │ mean │ min │ median │ max │ nunique │ nmissing │ eltype │\n", + "├─────┼──────────┼──────┼─────┼────────┼─────┼─────────┼──────────┼────────┤\n", + "│ 1 │ A │ 2.5 │ 1 │ 2.5 │ 4 │ │ 0 │ Int64 │\n", + "│ 2 │ B │ │ A │ │ C │ 2 │ 0 │ String │" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "showcols(x)" + "describe(x)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Since we've excluded missing values, we can safely use `disallowmissing!` so that the columns will no longer accept missing values." + "Since we've excluded missing values, we can safely use `disallowmissing!` so that the columns will no longer accept missing values (we can see this as `nmissing` column is empty)." ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "2×2 DataFrames.DataFrame\n", - "│ Col # │ Name │ Eltype │ Missing │ Values │\n", - "├───────┼──────┼────────┼─────────┼─────────┤\n", - "│ 1 │ A │ Int64 │ 0 │ 1 … 4 │\n", - "│ 2 │ B │ String │ 0 │ A … C │" - ] + "data": { + "text/html": [ + "
variablemeanminmedianmaxnuniquenmissingeltype
1A2.512.54Int64
2BAC2String
" + ], + "text/plain": [ + "2×8 DataFrames.DataFrame\n", + "│ Row │ variable │ mean │ min │ median │ max │ nunique │ nmissing │ eltype │\n", + "├─────┼──────────┼──────┼─────┼────────┼─────┼─────────┼──────────┼────────┤\n", + "│ 1 │ A │ 2.5 │ 1 │ 2.5 │ 4 │ │ │ Int64 │\n", + "│ 2 │ B │ │ A │ │ C │ 2 │ │ String │" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "disallowmissing!(x)\n", - "showcols(x)" + "describe(x)" ] } ], @@ -705,7 +717,7 @@ "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", - "version": "0.6.2" + "version": "0.6.3" } }, "nbformat": 4, diff --git a/04_loadsave.ipynb b/04_loadsave.ipynb index b6b1a1e..8ca3145 100644 --- a/04_loadsave.ipynb +++ b/04_loadsave.ipynb @@ -5,15 +5,13 @@ "metadata": {}, "source": [ "# Introduction to DataFrames\n", - "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018**" + "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), July 25, 2018**" ] }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "using DataFrames # load package" @@ -32,9 +30,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "using CSV\n", @@ -131,7 +127,8 @@ " dateformat: nothing\n", " decimal: '.'\n", " truestring: 'true'\n", - " falsestring: 'false', IOBuffer(data=UInt8[...], readable=true, writable=true, seekable=true, append=false, size=0, maxsize=Inf, ptr=1, mark=-1), \"x.csv\", 8, true, String[\"A\", \"B\", \"C\", \"D\"], 4, false, Val{false})" + " falsestring: 'false'\n", + " internstrings: true, IOBuffer(data=UInt8[...], readable=true, writable=true, seekable=true, append=false, size=0, maxsize=Inf, ptr=1, mark=-1), \"x.csv\", 8, true, String[\"A\", \"B\", \"C\", \"D\"], 4, false, Val{false})" ] }, "execution_count": 5, @@ -246,9 +243,7 @@ { "cell_type": "code", "execution_count": 9, 
- "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "save(\"x.jld\", \"x\", x)" @@ -338,15 +333,15 @@ "name": "stdout", "output_type": "stream", "text": [ - " 0.782157 seconds (688.90 k allocations: 30.828 MiB, 1.08% gc time)\n", - " 0.018250 seconds (203.61 k allocations: 3.339 MiB)\n" + " 2.529390 seconds (936.79 k allocations: 44.927 MiB, 0.83% gc time)\n", + " 0.018948 seconds (203.62 k allocations: 3.339 MiB)\n" ] }, { "data": { "text/plain": [ "2-element Array{Int64,1}:\n", - " 595307\n", + " 595456\n", " 154487" ] }, @@ -372,9 +367,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "foreach(rm, [\"x.csv\", \"x.jld\", \"bigdf.csv\", \"bigdf.jld\"])" @@ -383,7 +376,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Julia 0.6.0", + "display_name": "Julia 0.6.2", "language": "julia", "name": "julia-0.6" }, @@ -391,7 +384,7 @@ "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", - "version": "0.6.0" + "version": "0.6.3" } }, "nbformat": 4, diff --git a/05_columns.ipynb b/05_columns.ipynb index e9c4ab6..2790a92 100644 --- a/05_columns.ipynb +++ b/05_columns.ipynb @@ -5,15 +5,13 @@ "metadata": {}, "source": [ "# Introduction to DataFrames\n", - "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018**" + "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), July 25, 2018**" ] }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "using DataFrames # load package" @@ -43,7 +41,7 @@ { "data": { "text/html": [ - "
x1x2x3x4
1falsefalsefalsefalse
2falsefalsefalsefalse
3falsefalsefalsefalse
" + "
x1x2x3x4
1falsefalsefalsefalse
2falsefalsefalsefalse
3truetruetruefalse
" ], "text/plain": [ "3×4 DataFrames.DataFrame\n", @@ -51,7 +49,7 @@ "├─────┼───────┼───────┼───────┼───────┤\n", "│ 1 │ false │ false │ false │ false │\n", "│ 2 │ false │ false │ false │ false │\n", - "│ 3 │ false │ false │ false │ false │" + "│ 3 │ true │ true │ true │ false │" ] }, "execution_count": 2, @@ -78,7 +76,7 @@ { "data": { "text/html": [ - "
Ax2x3x4
1falsefalsefalsefalse
2falsefalsefalsefalse
3falsefalsefalsefalse
" + "
Ax2x3x4
1falsefalsefalsefalse
2falsefalsefalsefalse
3truetruetruefalse
" ], "text/plain": [ "3×4 DataFrames.DataFrame\n", @@ -86,7 +84,7 @@ "├─────┼───────┼───────┼───────┼───────┤\n", "│ 1 │ false │ false │ false │ false │\n", "│ 2 │ false │ false │ false │ false │\n", - "│ 3 │ false │ false │ false │ false │" + "│ 3 │ true │ true │ true │ false │" ] }, "execution_count": 3, @@ -115,7 +113,7 @@ { "data": { "text/html": [ - "
x1x1x2x2x3x3x4x4
1falsefalsefalsefalse
2falsefalsefalsefalse
3falsefalsefalsefalse
" + "
x1x1x2x2x3x3x4x4
1falsefalsefalsefalse
2falsefalsefalsefalse
3truetruetruefalse
" ], "text/plain": [ "3×4 DataFrames.DataFrame\n", @@ -123,7 +121,7 @@ "├─────┼───────┼───────┼───────┼───────┤\n", "│ 1 │ false │ false │ false │ false │\n", "│ 2 │ false │ false │ false │ false │\n", - "│ 3 │ false │ false │ false │ false │" + "│ 3 │ true │ true │ true │ false │" ] }, "execution_count": 4, @@ -152,7 +150,7 @@ { "data": { "text/html": [ - "
x1x1x2x2thirdx4x4
1falsefalsefalsefalse
2falsefalsefalsefalse
3falsefalsefalsefalse
" + "
x1x1x2x2thirdx4x4
1falsefalsefalsefalse
2falsefalsefalsefalse
3truetruetruefalse
" ], "text/plain": [ "3×4 DataFrames.DataFrame\n", @@ -160,7 +158,7 @@ "├─────┼───────┼───────┼───────┼───────┤\n", "│ 1 │ false │ false │ false │ false │\n", "│ 2 │ false │ false │ false │ false │\n", - "│ 3 │ false │ false │ false │ false │" + "│ 3 │ true │ true │ true │ false │" ] }, "execution_count": 5, @@ -187,7 +185,7 @@ { "data": { "text/html": [ - "
abcd
1falsefalsefalsefalse
2falsefalsefalsefalse
3falsefalsefalsefalse
" + "
abcd
1falsefalsefalsefalse
2falsefalsefalsefalse
3truetruetruefalse
" ], "text/plain": [ "3×4 DataFrames.DataFrame\n", @@ -195,7 +193,7 @@ "├─────┼───────┼───────┼───────┼───────┤\n", "│ 1 │ false │ false │ false │ false │\n", "│ 2 │ false │ false │ false │ false │\n", - "│ 3 │ false │ false │ false │ false │" + "│ 3 │ true │ true │ true │ false │" ] }, "execution_count": 6, @@ -253,7 +251,7 @@ { "data": { "text/html": [ - "
aa_1a_2a_3
1falsefalsefalsefalse
2falsefalsefalsefalse
3falsefalsefalsefalse
" + "
aa_1a_2a_3
1falsefalsefalsefalse
2falsefalsefalsefalse
3truetruetruefalse
" ], "text/plain": [ "3×4 DataFrames.DataFrame\n", @@ -261,7 +259,7 @@ "├─────┼───────┼───────┼───────┼───────┤\n", "│ 1 │ false │ false │ false │ false │\n", "│ 2 │ false │ false │ false │ false │\n", - "│ 3 │ false │ false │ false │ false │" + "│ 3 │ true │ true │ true │ false │" ] }, "execution_count": 8, @@ -297,7 +295,7 @@ { "data": { "text/html": [ - "
a_1a_3a_2a
1falsefalsefalsefalse
2falsefalsefalsefalse
3falsefalsefalsefalse
" + "
a_1a_3a_2a
1falsefalsefalsefalse
2falsefalsefalsefalse
3truefalsetruetrue
" ], "text/plain": [ "3×4 DataFrames.DataFrame\n", @@ -305,7 +303,7 @@ "├─────┼───────┼───────┼───────┼───────┤\n", "│ 1 │ false │ false │ false │ false │\n", "│ 2 │ false │ false │ false │ false │\n", - "│ 3 │ false │ false │ false │ false │" + "│ 3 │ true │ false │ true │ true │" ] }, "execution_count": 9, @@ -322,7 +320,35 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "also `permutecols!` will be introduced in next release of DataFrames" + "Also `permutecols!` can be used to achieve this in place:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
a_3a_2a_1a
1falsefalsefalsefalse
2falsefalsefalsefalse
3falsetruetruetrue
" + ], + "text/plain": [ + "3×4 DataFrames.DataFrame\n", + "│ Row │ a_3 │ a_2 │ a_1 │ a │\n", + "├─────┼───────┼───────┼───────┼───────┤\n", + "│ 1 │ false │ false │ false │ false │\n", + "│ 2 │ false │ false │ false │ false │\n", + "│ 3 │ false │ true │ true │ true │" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "permutecols!(x, 4:-1:1); x" ] }, { @@ -334,7 +360,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -351,7 +377,7 @@ "│ 3 │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -369,7 +395,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -386,7 +412,7 @@ "│ 3 │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -404,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -421,7 +447,7 @@ "│ 3 │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │ 3 │" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -439,7 +465,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -456,7 +482,7 @@ "│ 3 │ 3 │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -474,7 +500,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -491,7 +517,7 @@ "│ 3 │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │ 3 │" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -509,7 +535,7 @@ }, { "cell_type": "code", - 
"execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -526,7 +552,7 @@ "│ 3 │ 3 │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -544,14 +570,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 20.993 μs (133 allocations: 10.20 KiB)\n" + " 21.460 μs (133 allocations: 10.20 KiB)\n" ] }, { @@ -568,7 +594,7 @@ "│ 3 │ (3, 1) │ (3, 2) │ 3 │ (3, 3) │ (3, 4) │" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -587,7 +613,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -604,7 +630,7 @@ "│ 3 │ 3 │ 3 │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -622,7 +648,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -639,7 +665,7 @@ "│ 3 │ 3 │ 3 │ 3 │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -657,7 +683,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -681,7 +707,7 @@ "│ 3 │ (3, 1) │ (3, 2) │ 3 │ (3, 3) │ (3, 4) │" ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -699,7 +725,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -716,7 +742,7 @@ "│ 3 │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │ 3 │" ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -734,7 +760,7 @@ }, { "cell_type": "code", - "execution_count": 21, + 
"execution_count": 22, "metadata": {}, "outputs": [ { @@ -751,7 +777,7 @@ "│ 3 │ 3 │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │ 3 │" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -769,7 +795,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -793,7 +819,7 @@ "│ 3 │ 'c' │ 6 │ 'f' │ 13 │)" ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -813,7 +839,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -830,7 +856,7 @@ "│ 3 │ 3 │ 6 │ 'c' │ 'f' │ 13 │" ] }, - "execution_count": 23, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -852,7 +878,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -869,7 +895,7 @@ "│ 3 │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │ (3, 5) │" ] }, - "execution_count": 24, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -887,7 +913,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -904,7 +930,7 @@ "│ 3 │ (3, 1) │ (3, 2) │ (3, 4) │ (3, 5) │" ] }, - "execution_count": 25, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -922,7 +948,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -939,7 +965,7 @@ "│ 3 │ (3, 1) │ (3, 4) │" ] }, - "execution_count": 26, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -957,7 +983,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -974,7 +1000,7 @@ "│ 3 │ (3, 1) │ (3, 3) │ (3, 5) │" ] }, - "execution_count": 27, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -992,7 +1018,7 @@ }, { 
"cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1009,7 +1035,7 @@ "│ 3 │ (3, 1) │" ] }, - "execution_count": 28, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1027,7 +1053,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -1039,7 +1065,7 @@ " (3, 1)" ] }, - "execution_count": 29, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1057,7 +1083,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1069,7 +1095,7 @@ " (3, 1)" ] }, - "execution_count": 30, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1087,7 +1113,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1099,7 +1125,7 @@ "0×0 DataFrames.DataFrame\n" ] }, - "execution_count": 31, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1117,7 +1143,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -1136,7 +1162,7 @@ "│ 3 │ (3, 1) │ (3, 2) │ (3, 4) │ (3, 5) │)" ] }, - "execution_count": 32, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -1155,7 +1181,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -1172,7 +1198,7 @@ "│ 3 │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │ (3, 5) │" ] }, - "execution_count": 33, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -1190,7 +1216,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -1207,7 +1233,7 @@ "│ 3 │ (3, 2) │ (3, 2) │ (3, 3) │ (3, 4) │ (3, 5) │" ] }, - "execution_count": 34, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -1226,7 
+1252,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -1243,7 +1269,7 @@ "│ 3 │ (3, 2) │ (3, 2) │ (3, 3) │ (3, 4) │ (3, 5) │ 3 │" ] }, - "execution_count": 35, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -1262,7 +1288,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -1279,7 +1305,7 @@ "│ 3 │ (3, 2) │ (3, 2) │ (3, 3) │ (3, 4) │ (3, 5) │ 3 │ 13 │" ] }, - "execution_count": 36, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -1298,7 +1324,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -1315,7 +1341,7 @@ "│ 3 │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │ (3, 5) │" ] }, - "execution_count": 37, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -1333,7 +1359,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -1342,7 +1368,7 @@ "true" ] }, - "execution_count": 38, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -1360,7 +1386,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -1369,7 +1395,7 @@ "2" ] }, - "execution_count": 39, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -1381,7 +1407,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Julia 0.6.0", + "display_name": "Julia 0.6.2", "language": "julia", "name": "julia-0.6" }, @@ -1389,7 +1415,7 @@ "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", - "version": "0.6.0" + "version": "0.6.3" } }, "nbformat": 4, diff --git a/08_joins.ipynb b/08_joins.ipynb index 2d273be..60b83f1 100644 --- a/08_joins.ipynb +++ b/08_joins.ipynb @@ -5,7 +5,7 @@ "metadata": {}, "source": [ "# Introduction to DataFrames\n", - "**[Bogumił 
Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2017**" + "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), July 25, 2018**" ] }, { @@ -417,20 +417,24 @@ { "data": { "text/html": [ - "
id1id2nameid2_1age
111Alice1121
211Alice122
3111Bob1121
4111Bob122
5missingmissingZedmissing99
6missingmissingZed999100
7missing99Zoemissing99
8missing99Zoe999100
" + "
id1id2nameid2_1agesource
111Alice1121both
211Alice122both
3111Bob1121both
4111Bob122both
522Conormissingmissingleft_only
6221Davemissingmissingleft_only
7missingmissingZedmissing99both
8missingmissingZed999100both
9missing99Zoemissing99both
10missing99Zoe999100both
113missingmissing3123right_only
123missingmissing324right_only
" ], "text/plain": [ - "8×5 DataFrames.DataFrame\n", - "│ Row │ id1 │ id2 │ name │ id2_1 │ age │\n", - "├─────┼─────────┼─────────┼───────┼─────────┼─────┤\n", - "│ 1 │ 1 │ 1 │ Alice │ 11 │ 21 │\n", - "│ 2 │ 1 │ 1 │ Alice │ 1 │ 22 │\n", - "│ 3 │ 1 │ 11 │ Bob │ 11 │ 21 │\n", - "│ 4 │ 1 │ 11 │ Bob │ 1 │ 22 │\n", - "│ 5 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ Zed │ \u001b[90mmissing\u001b[39m │ 99 │\n", - "│ 6 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ Zed │ 999 │ 100 │\n", - "│ 7 │ \u001b[90mmissing\u001b[39m │ 99 │ Zoe │ \u001b[90mmissing\u001b[39m │ 99 │\n", - "│ 8 │ \u001b[90mmissing\u001b[39m │ 99 │ Zoe │ 999 │ 100 │" + "12×6 DataFrames.DataFrame\n", + "│ Row │ id1 │ id2 │ name │ id2_1 │ age │ source │\n", + "├─────┼─────────┼─────────┼─────────┼─────────┼─────────┼────────────┤\n", + "│ 1 │ 1 │ 1 │ Alice │ 11 │ 21 │ both │\n", + "│ 2 │ 1 │ 1 │ Alice │ 1 │ 22 │ both │\n", + "│ 3 │ 1 │ 11 │ Bob │ 11 │ 21 │ both │\n", + "│ 4 │ 1 │ 11 │ Bob │ 1 │ 22 │ both │\n", + "│ 5 │ 2 │ 2 │ Conor │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ left_only │\n", + "│ 6 │ 2 │ 21 │ Dave │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ left_only │\n", + "│ 7 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ Zed │ \u001b[90mmissing\u001b[39m │ 99 │ both │\n", + "│ 8 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ Zed │ 999 │ 100 │ both │\n", + "│ 9 │ \u001b[90mmissing\u001b[39m │ 99 │ Zoe │ \u001b[90mmissing\u001b[39m │ 99 │ both │\n", + "│ 10 │ \u001b[90mmissing\u001b[39m │ 99 │ Zoe │ 999 │ 100 │ both │\n", + "│ 11 │ 3 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ 31 │ 23 │ right_only │\n", + "│ 12 │ 3 │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ 3 │ 24 │ right_only │" ] }, "execution_count": 13, @@ -439,13 +443,35 @@ } ], "source": [ - "join(x, y, on=[:id1], makeunique=true) # with duplicates all combinations are produced (here :inner join)" + "join(x, y, 
on=[:id1], makeunique=true, kind=:outer, indicator=:source) # with duplicates all combinations are produced" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, + "outputs": [ + { + "ename": "LoadError", + "evalue": "\u001b[91mArgumentError: Merge key(s) are not unique in both df1 and df2. First duplicate in df1 at 2. First duplicate in df2 at 2\u001b[39m", + "output_type": "error", + "traceback": [ + "\u001b[91mArgumentError: Merge key(s) are not unique in both df1 and df2. First duplicate in df1 at 2. First duplicate in df2 at 2\u001b[39m", + "", + "Stacktrace:", + " [1] \u001b[1m#join#138\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Array{Symbol,1}, ::Symbol, ::Bool, ::Void, ::Tuple{Bool,Bool}, ::Function, ::DataFrames.DataFrame, ::DataFrames.DataFrame\u001b[1m)\u001b[22m\u001b[22m at \u001b[1mD:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\abstractdataframe\\join.jl:327\u001b[22m\u001b[22m", + " [2] \u001b[1m(::Base.#kw##join)\u001b[22m\u001b[22m\u001b[1m(\u001b[22m\u001b[22m::Array{Any,1}, ::Base.#join, ::DataFrames.DataFrame, ::DataFrames.DataFrame\u001b[1m)\u001b[22m\u001b[22m at \u001b[1m.\\:0\u001b[22m\u001b[22m" + ] + } + ], + "source": [ + "join(x, y, on=[:id1], makeunique=true, validate=(true,true)) # you can force validation of uniqueness of key on which you join" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, "outputs": [ { "data": { @@ -462,7 +488,7 @@ "│ 4 │ \u001b[90mmissing\u001b[39m │ 99 │ Zoe │" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -482,7 +508,7 @@ "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", - "version": "0.6.2" + "version": "0.6.3" } }, "nbformat": 4, diff --git a/10_transforms.ipynb b/10_transforms.ipynb index 6bfeb84..8281c3e 100644 --- a/10_transforms.ipynb +++ b/10_transforms.ipynb @@ -562,7 +562,7 @@ "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", - "version": 
"0.6.2" + "version": "0.6.3" } }, "nbformat": 4, diff --git a/README.md b/README.md index 71e7a3d..9865043 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # An Introduction to DataFrames -[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018 +[Bogumił Kamiński](http://bogumilkaminski.pl/about/), July 25, 2018 A brief introduction to basic usage of [DataFrames](https://github.com/JuliaData/DataFrames.jl). -Tested under Julia 0.6.2, DataFrames 0.11.6, CSV 0.2.4, JLD 0.8.3, Missings 0.2.9, CategoricalArrays 0.3.9, FreqTables 0.2.2, DataFramesMeta 0.3.0, StatPlots 0.7.2. +Tested under Julia 0.6.3, DataFrames 0.11.7, CSV 0.2.5, JLD 0.8.3, Missings 0.2.10, CategoricalArrays 0.3.11, FreqTables 0.2.2, DataFramesMeta 0.3.0, StatPlots 0.7.2. I will try to keep it up to date as the package evolves. This tutorial covers @@ -54,14 +54,15 @@ Changelog: | 2018-05-01 | Added `byrow!` example | | 2018-05-13 | Added `StatPlots` package to extras | | 2018-05-23 | Improved comments in sections 1 do 5 by [Jane Herriman](https://github.com/xorJane) | +| 2018-07-25 | Update to 0.11.7 release | # Core functions summary 1. Constructors: `DataFrame` -2. Getting summary: `size`, `nrow`, `ncol`, `length`, `describe`, `showcols`, `names`, `eltypes`, `head`, `tail` +2. Getting summary: `size`, `nrow`, `ncol`, `length`, `describe`, `names`, `eltypes`, `head`, `tail` 3. Handling missing: `missing` (singleton instance of `Missing`), `ismissing`, `Missings.T`, `skipmissing`, `coalesce`, `allowmissing`, `disallowmissing`, `allowmissing!`, `completecases`, `dropmissing`, `dropmissing!`, disallowmissing, disallowmissing! 4. Loading and saving: `CSV` (package), `JLD` (package), `CSV.read`, `CSV.write`, `save` (from `JLD`), `load` (from `JLD`) -5. Working with columns: `rename`, `rename!`, `names!`, `hcat`, `insert!`, `DataFrames.hcat!`, `merge!`, `delete!`, `empty!`, `categorical!`, `DataFrames.index` +5. 
Working with columns: `rename`, `rename!`, `names!`, `hcat`, `insert!`, `DataFrames.hcat!`, `merge!`, `delete!`, `empty!`, `categorical!`, `DataFrames.index`, `permutecols!` 6. Working with rows: `sort!`, `sort`, `issorted`, `append!`, `vcat`, `push!`, `view`, `filter`, `filter!`, `deleterows!`, `unique`, `nonunique`, `unique!` 7. Working with categorical: `categorical`, `cut`, `isordered`, `ordered!`, `levels`, `unique`, `levels!`, `droplevels!`, `get`, `recode`, `recode!` 8. Joining: `join` @@ -75,7 +76,3 @@ Changelog: # Changes in DataFrames master since last update of the tutorial -1. Improved rendering of `#undef` in HTML/LaTeX. -2. Added `permutecols!` function. -3. `describe` returns a `DataFrame` -4. On Julia 0.7 you can access columns of `DataFrame` using `.` notation