Fix identifying variables with _ in the name

Sbozzolo · Sbozzolo · commit 26bf2a1de6d6 · 2024-10-03T11:17:53.000-07:00
The regular expression used by ClimaAnalysis was not identifying
correctly variables with an underscore in the short name. The reason for
this was that the time interval capturing group was too greedy. I
changed the capturing group to only match digits, `.`, `_`, and the unit
letters m|M|d|y|s|h (so minutes are now written `m`, not `min`), allowing
me to capture more general short names.
diff --git a/NEWS.md b/NEWS.md
@@ -24,6 +24,9 @@ dimension.
 
 - Interpolation is not possible with dates. When dates are detected in any dimension, an
   interpolat will not be made.
+- Fix identifying variables with underscore in the short name (such as
+  `net_toa_flux`). ([#109](https://github.com/CliMA/ClimaAnalysis.jl/pull/109
+  "PR109"))
 
 v0.5.9
 ------
@@ -58,11 +61,11 @@ julia> reordered_var.dims |> keys |> collect
 ## Bug fixes
 
 - Fix models repeating in legend of box plots by not considering the models in `model_names`
-  when finding the best and worst models
+  when finding the best and worst models.
 - Fix legend from covering the box plot by adding the parameter `legend_text_width` which
-  control the number of characters on each line of the legend of the box plot
+  controls the number of characters on each line of the legend of the box plot.
 - Use default marker size instead of a marker size of 20 when plotting other models beside
-  `CliMA` on the box plot
+  `CliMA` on the box plot.
 - Fix support for `""` in units.
 
 v0.5.8
diff --git a/src/Utils.jl b/src/Utils.jl
@@ -26,8 +26,8 @@ julia> match_nc_filename("ta_1d_average.nc")
 ```
 
 ```jldoctest
-julia> match_nc_filename("pfull_6.0min_max.nc")
-("pfull", "6.0min", "max")
+julia> match_nc_filename("pfull_6.0m_max.nc")
+("pfull", "6.0m", "max")
 ```
 
 ```jldoctest
@@ -39,23 +39,36 @@ function match_nc_filename(filename::String)
     # Let's unpack this regular expression to find files names like "orog_inst.nc" or
     # "ta_3.0h_average.nc" and extract information from there.
 
-    # ^ $: mean match the entire string
-    # (\w+?): the first capturing group, matching any word non greedily
-    # _: matches this literal character
-    # (?>([a-zA-Z0-9\.]*)_)?: an optional group (it doesn't always exist for _inst
-    #                         variables) ?> means that we don't want to capture the outside
-    #                         group the inside group is any combinations of letters/numbers,
-    #                         and the literal character ., followed by the _. We capture the
-    #                         combination of characters because that's the reduction
-    # (\w+): Again, any word
-    # \.nc: file extension has to be .nc
-    re = r"^(\w+?)_(?>([a-zA-Z0-9_\.]*)_)?(\w*)\.nc$"
+    # ^: Matches the beginning of the string
+
+    # (\w+?): Matches one or more word characters (letters, numbers, or underscore)
+    # non-greedily and captures it as the first group (variable name)
+
+    # _: Matches the underscore separating the variable name and the optional time
+    # resolution.
+
+    # ((?:[0-9]|m|M|d|s|y|h|_|\.)*?): Matches zero or more occurrences of the allowed
+    # characters (digits, time units, underscore, or dot) non-greedily and captures the
+    # entire time resolution string as the second group
+
+    # _?: Matches an optional underscore (to handle cases where there's no time resolution)
+
+    # ([a-zA-Z0-9]+): Matches one or more alphanumeric characters and captures it as the
+    # third group (statistic)
+
+    # \.nc: Matches the literal ".nc" file extension
+
+    # $: Matches the end of the string
+
+    re = r"^(\w+?)_((?:[0-9]|m|M|d|s|y|h|_|\.)*?)_?([a-zA-Z0-9]+)\.nc$"
     m = match(re, filename)
     if !isnothing(m)
         # m.captures returns `SubString`s (or nothing). We want to have actual `String`s (or
-        # nothing) so that we can assume we have `String`s everywhere.
+        # nothing) so that we can assume we have `String`s everywhere. We also take care of
+        # the case where the period is matched to an empty string and return nothing instead
         return Tuple(
-            isnothing(cap) ? nothing : String(cap) for cap in m.captures
+            (isnothing(cap) || cap == "") ? nothing : String(cap) for
+            cap in m.captures
         )
     else
         return nothing
diff --git a/test/test_Utils.jl b/test/test_Utils.jl
@@ -9,11 +9,20 @@ import Dates
     @test Utils.match_nc_filename("ta_1d_average.nc") ==
           Tuple(["ta", "1d", "average"])
 
-    @test Utils.match_nc_filename("ta_1m_40s_inst.nc") ==
-          Tuple(["ta", "1m_40s", "inst"])
+    @test Utils.match_nc_filename("ta_3.0h_average.nc") ==
+          Tuple(["ta", "3.0h", "average"])
 
-    @test Utils.match_nc_filename("pfull_6.0min_max.nc") ==
-          Tuple(["pfull", "6.0min", "max"])
+    @test Utils.match_nc_filename("toa_net_flux_1m_40s_inst.nc") ==
+          Tuple(["toa_net_flux", "1m_40s", "inst"])
+
+    @test Utils.match_nc_filename("toa_net_flux_1M_inst.nc") ==
+          Tuple(["toa_net_flux", "1M", "inst"])
+
+    @test Utils.match_nc_filename("p500_1M_inst.nc") ==
+          Tuple(["p500", "1M", "inst"])
+
+    @test Utils.match_nc_filename("pfull_6.0m_max.nc") ==
+          Tuple(["pfull", "6.0m", "max"])
 
     @test Utils.match_nc_filename("hu_inst.nc") ==
           Tuple(["hu", nothing, "inst"])