Closed
Description
I removed this from readme because 1. it uses outdated syntax 2. maxOrNull doesn't work now 3. it looks out of place, there's a lot of content before it
Create:
// create columns
val fromTo by columnOf("LoNDon_paris", "MAdrid_miLAN", "londON_StockhOlm", "Budapest_PaRis", "Brussels_londOn")
val flightNumber by columnOf(10045.0, Double.NaN, 10065.0, Double.NaN, 10085.0)
val recentDelays by columnOf("23,47", null, "24, 43, 87", "13", "67, 32")
val airline by columnOf("KLM(!)", "{Air France} (12)", "(British Airways. )", "12. Air France", "'Swiss Air'")
// create dataframe
val df = dataFrameOf(fromTo, flightNumber, recentDelays, airline)
// print dataframe
df.print()
Clean:
// typed accessors for columns
// that will appear during
// dataframe transformation
val origin by column<String>()
val destination by column<String>()
val clean = df
// fill missing flight numbers
.fillNA { flightNumber }.with { prev()!!.flightNumber + 10 }
// convert flight numbers to int
.convert { flightNumber }.toInt()
// clean 'airline' column
.update { airline }.with { "([a-zA-Z\\s]+)".toRegex().find(it)?.value ?: "" }
// split 'fromTo' column into 'origin' and 'destination'
.split { fromTo }.by("_").into(origin, destination)
// clean 'origin' and 'destination' columns
.update { origin and destination }.with { it.lowercase().replaceFirstChar(Char::uppercase) }
// split lists of delays in 'recentDelays' into separate columns
// 'delay1', 'delay2'... and nest them inside original column `recentDelays`
.split { recentDelays }.inward { "delay$it" }
// convert string values in `delay1`, `delay2` into ints
.parse { recentDelays }
Aggregate:
clean
// group by the flight origin renamed into "from"
.groupBy { origin named "from" }.aggregate {
// we are in the context of a single data group
// total number of flights from origin
count() into "count"
// list of flight numbers
flightNumber into "flight numbers"
// counts of flights per airline
airline.valueCounts() into "airlines"
// max delay across all delays in `delay1` and `delay2`
recentDelays.maxOrNull { delay1 and delay2 } into "major delay"
// separate lists of recent delays for `delay1`, `delay2` and `delay3`
recentDelays.implode(dropNA = true) into "recent delays"
// total delay per destination
pivot { destination }.sum { recentDelays.colsOf<Int?>() } into "total delays to"
}
Check it out on Datalore to get a better visual impression of what happens and what the hierarchical dataframe structure looks like.
Metadata
Metadata
Assignees
Labels
No labels