Add Leaderboard.read_rmses

ph-kev · ph-kev · commit 615ca8284fb3 · 2024-09-20T23:42:36.000Z
This commit adds the functionality of reading RMSE values from a CSV
file.
diff --git a/NEWS.md b/NEWS.md
@@ -323,6 +323,26 @@ rmse_var = ClimaAnalysis.RMSEVariable(
 A `RMSEVariable` can be inspected using `model_names`, `category_names`, and `rmse_units`
 which provide the model names, the category names, and the units respectively.
 
+#### Reading RMSEs from CSV file
+
+A CSV file containing model names in the first column and root mean squared errors in the
+subsequent columns with a header describing each category (e.g. seasons) can be read into
+a `RMSEVariable`. See the example below on how to use this functionality.
+
+```julia
+rmse_var = ClimaAnalysis.read_rmses("./data/test_csv.csv", "ta")
+rmse_var = ClimaAnalysis.read_rmses(
+    "./data/test_csv.csv",
+    "ta",
+    units = Dict("ACCESS-CM2" => "K", "ACCESS-ESM1-5" => "K"), # passing units as a dictionary
+)
+rmse_var = ClimaAnalysis.read_rmses(
+    "./data/test_csv.csv",
+    "ta",
+    units = "K", # passing units as a string
+)
+```
+
 ## Bug fixes
 
 - Increased the default value for `warp_string` to 72.
diff --git a/docs/src/api.md b/docs/src/api.md
@@ -80,6 +80,7 @@ Leaderboard.RMSEVariable(short_name::String, model_names::Vector{String}, catego
 Leaderboard.model_names
 Leaderboard.category_names
 Leaderboard.rmse_units
+Leaderboard.read_rmses
 ```
 
 ## Utilities
diff --git a/docs/src/data/test_csv.csv b/docs/src/data/test_csv.csv
@@ -0,0 +1,3 @@
+# Model,DJF,MAM,JJA,SON,ANN
+ACCESS-CM2,11.941,10.178,13.279,10.443,8.710
+ACCESS-ESM1-5,15.752,12.477,15.955,12.972,NaN
diff --git a/docs/src/rmse_var.md b/docs/src/rmse_var.md
@@ -63,4 +63,34 @@ which provide the model names, the category names, and the units respectively.
 ClimaAnalysis.model_names(rmse_var)
 ClimaAnalysis.category_names(rmse_var)
 ClimaAnalysis.rmse_units(rmse_var)
+```
+
+## Reading RMSEs from CSV file
+
+Typically, the root mean squared errors (RMSEs) of different models across different
+categories are stored in a different file and need to be loaded in. `ClimaAnalysis` can load
+this information from a CSV file and store it in a `RMSEVariable`. The format of the CSV
+file should have a header consisting of the entry "model_name" (or any other text as it is
+ignored by the function) and the rest of the entries should be the category names. Each row
+after the header should start with the model name followed by the root mean squared errors
+for each category for that model. The entries of the CSV file should be separated by commas.
+
+See the example below using `read_rmses` where data is loaded from `test_csv.csv` and a
+short name of `ta` is provided. One can also pass in a dictionary mapping model names to
+units for `units` or a string if the units are the same for all the models.
+
+```@example rmse_var
+rmse_var = ClimaAnalysis.read_rmses("./data/test_csv.csv", "ta")
+rmse_var = ClimaAnalysis.read_rmses(
+    "./data/test_csv.csv",
+    "ta",
+    units = Dict("ACCESS-CM2" => "K", "ACCESS-ESM1-5" => "K"), # passing units as a dictionary
+)
+rmse_var = ClimaAnalysis.read_rmses(
+    "./data/test_csv.csv",
+    "ta",
+    units = "K", # passing units as a string
+)
+
+nothing # hide
 ```
diff --git a/src/Leaderboard.jl b/src/Leaderboard.jl
@@ -6,7 +6,8 @@ import NaNStatistics: nanmedian
 export RMSEVariable,
     model_names,
     category_names,
-    rmse_units
+    rmse_units,
+    read_rmses
 
 """
     Holding root mean squared errors over multiple categories and models for a single
@@ -236,4 +237,71 @@ Return all the unit of the models in `rmse_var`.
 """
 rmse_units(rmse_var::RMSEVariable) = rmse_var.units
 
+"""
+    read_rmses(csv_file::String, short_name::String; units = nothing)
+
+Read a CSV file and create a `RMSEVariable` with the `short_name` of the variable.
+
+The first line of the CSV file is the header. Its first entry (e.g. "model_name") is ignored
+and the rest of the entries are the category names. Each row after the header should start
+with the model name followed by the root mean squared error for each category for that
+model. The entries of the CSV file should be separated by commas. Blank lines after the
+header are skipped.
+
+The parameter `units` can be a dictionary mapping model name to unit or a string. If `units`
+is a string, then the units will be the same across all models. If `units` is `nothing`,
+then the unit is missing for each model which is denoted by an empty string.
+
+An error is thrown if a row does not have exactly one entry per column of the header, and
+an `ArgumentError` is thrown if the file contains no rows of RMSEs.
+"""
+function read_rmses(csv_file::String, short_name::String; units = nothing)
+    # Everything read from the file is returned from the `do` block rather than assigned
+    # to variables captured by the closure, so nothing is reassigned inside it
+    category_names, model_names, model_rmse_vec = open(csv_file, "r") do io
+        # The header holds the categories (e.g. DJF, MAM, JJA, SON, ANN) after its first
+        # entry, which labels the model column and is dropped
+        categories = String.(split(readline(io), ','))
+        popfirst!(categories)
+
+        models = String[]
+        rmse_rows = Vector{Float64}[]
+        for (line_num, line) in enumerate(eachline(io))
+            # Tolerate blank lines (e.g. at the end of the file)
+            isempty(strip(line)) && continue
+
+            fields = split(line, ',')
+
+            # Every row needs the model name plus one RMSE per category; the header is
+            # line 1 of the file, hence the offset in the reported line number
+            length(fields) != (length(categories) + 1) &&
+                error("Missing RMSEs for line $(line_num + 1) in CSV file")
+
+            push!(models, fields[1])
+            push!(rmse_rows, map(x -> parse(Float64, x), fields[2:end]))
+        end
+        (categories, models, rmse_rows)
+    end
+
+    # `stack` cannot handle an empty collection, so report the actual problem instead
+    isempty(model_rmse_vec) &&
+        throw(ArgumentError("No RMSEs found in CSV file $csv_file"))
+
+    # Stacking along the first dimension makes each row a model and each column a category
+    model_rmses = stack(model_rmse_vec, dims = 1)
+
+    # Normalize `units` into a dictionary when it is `nothing` or a single string; any
+    # other value is handed to the constructor untouched
+    model_units = if isnothing(units)
+        Dict{String, String}(model_name => "" for model_name in model_names)
+    elseif units isa String
+        Dict{String, String}(model_name => units for model_name in model_names)
+    else
+        units
+    end
+    return RMSEVariable(
+        short_name,
+        model_names,
+        category_names,
+        model_rmses,
+        model_units,
+    )
+end
+
 end
diff --git a/test/sample_data/test_csv.csv b/test/sample_data/test_csv.csv
@@ -0,0 +1,3 @@
+# Model,DJF,MAM,JJA,SON,ANN
+ACCESS-CM2,11.941,10.178,13.279,10.443,8.710
+ACCESS-ESM1-5,15.752,12.477,15.955,12.972,NaN
diff --git a/test/test_Leaderboard.jl b/test/test_Leaderboard.jl
@@ -127,3 +127,31 @@ import ClimaAnalysis
         Dict("model1" => ""),
     )
 end
+
+@testset "Reading RMSEs from CSV file" begin
+    # Read the sample CSV without units: every model gets an empty unit
+    csv_file_path = joinpath(@__DIR__, "sample_data/test_csv.csv")
+    rmse_var = ClimaAnalysis.read_rmses(csv_file_path, "ta")
+    @test ClimaAnalysis.model_names(rmse_var) == ["ACCESS-CM2", "ACCESS-ESM1-5"]
+    @test ClimaAnalysis.category_names(rmse_var) ==
+          ["DJF", "MAM", "JJA", "SON", "ANN"]
+    @test ClimaAnalysis.rmse_units(rmse_var) ==
+          Dict("ACCESS-CM2" => "", "ACCESS-ESM1-5" => "")
+    @test rmse_var.short_name == "ta"
+    @test rmse_var.RMSEs[1, 1] == 11.941
+    @test isnan(rmse_var.RMSEs[2, 5])
+
+    # Rows are models and columns are categories; `isequal` is used for the whole matrix
+    # because the sample file contains a NaN
+    @test size(rmse_var.RMSEs) == (2, 5)
+    @test isequal(
+        rmse_var.RMSEs,
+        [
+            11.941 10.178 13.279 10.443 8.710
+            15.752 12.477 15.955 12.972 NaN
+        ],
+    )
+
+    # Units given as a dictionary: models without an entry keep an empty unit and entries
+    # for unknown models are dropped
+    rmse_var = ClimaAnalysis.read_rmses(
+        csv_file_path,
+        "ta",
+        units = Dict("ACCESS-ESM1-5" => "m", "wacky" => "weird"),
+    )
+    @test ClimaAnalysis.rmse_units(rmse_var) ==
+          Dict("ACCESS-CM2" => "", "ACCESS-ESM1-5" => "m")
+
+    # Units given as a string: the same unit is used for every model
+    rmse_var = ClimaAnalysis.read_rmses(csv_file_path, "ta", units = "m")
+    @test ClimaAnalysis.rmse_units(rmse_var) ==
+          Dict("ACCESS-CM2" => "m", "ACCESS-ESM1-5" => "m")
+
+    # A row with fewer entries than the header is rejected
+    mktempdir() do dir
+        bad_csv_path = joinpath(dir, "missing_entry.csv")
+        write(bad_csv_path, "# Model,DJF,MAM\nmodel1,1.0\n")
+        @test_throws ErrorException ClimaAnalysis.read_rmses(bad_csv_path, "ta")
+    end
+end

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Model,DJF,MAM,JJA,SON,ANN`
	`2`	`+ACCESS-CM2,11.941,10.178,13.279,10.443,8.710`
	`3`	`+ACCESS-ESM1-5,15.752,12.477,15.955,12.972,NaN`