Merge pull request nationalparkservice#59 from RobLBaker/master

addresses bugs in loading core metadata for power BI
RobLBaker · Oct 21, 2024 · 4219725 · 4219725
2 parents fd25e9d + 7a1f70d
commit 4219725
Show file tree

Hide file tree

Showing 9 changed files with 74 additions and 30 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -23,7 +23,6 @@ Remotes:
     nationalparkservice/QCkit
 Imports: 
     EML,
-    sf,
     dplyr,
     httr,
     XML,
@@ -33,21 +32,23 @@ Imports:
     readr,
     magrittr,
     crayon,
-    leaflet,
-    lifecycle,
     EMLeditor (>= 0.1.5),
     DPchecker (>= 0.3.4),
     QCkit (>= 0.1.4),
     here,
     jsonlite,
     cli,
     purrr,
-    tibble
+    tibble,
+    lifecycle
 RoxygenNote: 7.3.2
 Suggests: 
     knitr,
     rmarkdown,
-    testthat (>= 3.0.0)
+    testthat (>= 3.0.0),
+    sf,
+    leaflet,
+    stringr
 VignetteBuilder: knitr
 URL: https://nationalparkservice.github.io/NPSutils/
 BugReports: https://github.com/nationalparkservice/NPSutils/issues
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,11 @@
+# NPSutils 0.3.3 (under development)
+
+## 2024-10-21
+  * Bug fixes to `load_data_package()`
+  * Bug fixes to `.get_authors()`
+  * `.get_authors()` now adds a period (.) after single-character given names and can handle any number of given names.
+  * Moved sf, leaflet, and stringr from Imports to Suggests.
+
 # NPSutils 0.3.2 "Lost Coast"
   * Add new functions, `load_data_packages()` and `load_data_package()`, which can load data packages (EML in .xml and data in .csv) similarly to the deprecated `load_data_package_deprecated()` function but also allows the data types in the tibbles loaded to be specified based on the information in the metadata.
   * Deprecate `load_data_package()` and rename it to `load_data_package_deprecated()`.

diff --git a/R/load_core_metadata.R b/R/load_core_metadata.R
@@ -5,7 +5,7 @@
 #' 
 #' @details The returned dataframe has three columns, EML_element, EML_data and EML_data2. EML_element describes the EML element that was extracted. EML_data and EML_data2 contain the data from that element. In the case of EML_elements with only one piece of data (e.g. the data package title), the data is repeated in the EML_data and EML_data2 columns. In cases where the element contains two related pieces of data (e.g. author), those items are held in EML_data (e.g. the author's name) and EML_data2 (e.g. the author's email address). 
 #' 
-#' Currently this function is under development and may have issues if an author has more than two givenNames (it will only use the first givenName), an author has not givenNames (only a surName) or an author is an organization and does not have any individualName. If you have a data package with these issues, please contact [[email protected]](mailto:[email protected]).
+#' Currently this function is under development and may have issues if an author is an organization. If you have a data package with these issues, please contact [[email protected]](mailto:[email protected]).
 #' 
 #' The fields that should be returned in the dataframe include: title, publication date, authors (and emails), contacts (and emails), publisher, DOI, publisher city, publisher state, content begin date, content end date, the abstract, notes, "for or by NPS", the license name (e.g. "Public Domain", "CC0"), and a list of each data file in the data package by name.
 #'
@@ -126,7 +126,7 @@ load_core_metadata <- function(ds_ref, path = paste0(getwd(), "/data")){
 #'   
 #' @description `.get_authors()` extracts the "creators" element from EML metadata and returns it as a dataframe with three columns: first, a column indicating that each row is an author; second, a column with the author's name (first last); third, the author's email address.
 #' 
-#' @details There are some known issues with this function; unfortunately at this time we do not have example data packages to test them. These include: authors without a givenName, authors with more than two givenNames (e.g. multiple middle names), organizations as authors where there is no individualName.
+#' @details There are some known issues with this function; unfortunately at this time we do not have example data packages to test them. These include: authors without a givenName and organizations as authors where there is no individualName.
 #'
 #' @param metadata an EML formatted R object
 #'
@@ -144,29 +144,40 @@ load_core_metadata <- function(ds_ref, path = paste0(getwd(), "/data")){
   #set up empty dataframe to hold creator info:
   individual <- data.frame(author = as.character(),
                            contact = as.character())
-  for(i in 1:length(seq_along(creators))){
+
+  #if single creator, nest it so that it behaves the same as when there are
+  #multiple creators:
+  if ("organizationName" %in% names(creators) |
+      "individualName" %in% names(creators)) {
+    creators <- list(creators)
+  }
+
+  for (i in 1:length(seq_along(creators))) {
     creator <- unlist(creators[[i]], recursive = FALSE)
     #if there is an individual name:
-    if(!is.null(creator$individualName.surName)){
+    if (!is.null(creator$
+                 individualName.surName)) {
       #if there is a given name:
-      if(!is.null(creator$individualName.givenName)){
-        #if there are two given names (e.g. first and middle)
-        if(length(seq_along(creator$individualName.givenName)) == 2){
-          given <- paste(creator$individualName.givenName[[1]],
-                         creator$individualName.givenName[[2]],
-                         sep = " ")
-          #if there is only one given name (first)
-        } else if(length(seq_along(creator$individualNAme.givenName)) == 1){
-          given <- creator$individualName.givenName
-        } else {
-          #More than 2 given names (e.g. first, middle, middle), use only the first given name:
-          given <- creator$individualName.givenName[[1]]
+      if (!is.null(creator$individualName.givenName)) {
+        given <- NULL
+        for (i in 1:length(seq_along(creator$individualName.givenName))) {
+          if (nchar(creator$individualName.givenName[[i]]) == 1) {
+            given <- paste0(given,
+                            paste0(creator$individualName.givenName[[i]],
+                            ". "))
+          } else {
+              given <- paste0(given,
+                              paste0(creator$individualName.given[[i]],
+                                     " "))
+          }
         }
-
       } else {
         #if there is no given name:
         given <- NA
       }
+      #get rid of extra whitespaces and trailing whitespaces:
+      given <- stringr::str_squish(given)
+
       #get last name
       sur <- creator$individualName.surName
       #generate full name as first (first) last

diff --git a/R/load_data_packages.R b/R/load_data_packages.R
@@ -105,11 +105,21 @@ load_data_packages <- function(reference_id,
                     nom4 <- nom3[["codeDefinition"]]
                     #get factors
                     factors <- NULL
+                    #if (length(seq_along(nom4)) > 1) {
+                      #nom4 <- unlist(nom4, recursive = FALSE)
+                    #}
                     #handle case where there is only one code definition
                     if ("code" %in% names(nom4)) {
                       nom4 <- list(nom4)
                     }
+                    # for(k in 1:length(seq_along(nom4))) {
+                    #  if("code" %in% names(nom4[k])) {
+                    #    factors <- append(factors, nom5[[k]])
+                    #  }
+                    #}
+
                     for (k in 1:length(seq_along(nom4))) {
+                      #print(paste0("i=",i, ", j=", j, " k=, ", k, "."))
                       factors <- append(factors, nom4[[k]][["code"]])
                     }
                     #set column type:
@@ -159,9 +169,9 @@ load_data_package <- function(reference_id,
                               simplify = TRUE) {
 
   x <- load_data_packages(reference_id, 
-                          directory = here::here("data"),
-                          assign_attributes = FALSE,
-                          simplify = TRUE)
+                          directory,
+                          assign_attributes,
+                          simplify)
   return(x)
 }
 

diff --git a/R/load_pgk_metadata.R b/R/load_pgk_metadata.R
@@ -51,9 +51,10 @@ load_pkg_metadata <- function(holding_id, directory = here::here("data")) {
 
   #load metadata
   eml_object <- EML::read_eml(meta_location, from = "xml")
-    attributeList <- EML::get_attributes(workingEMLfile$dataset$dataTable$attributeList)
-    attributes <- attributeList$attributes
-    factors <- attributeList$factors
+    #attributeList <- EML::get_attributes(eml_object)
+    attribute_list <- eml_object$dataset$dataTable$attributeList
+    attributes <- attribute_list$attributes
+    factors <- attribute_list$factors
 
     # Figure out column classes based on attribute table (character, numeric, integer, logical, or complex)
     attributes$columnclass <- "character"

diff --git a/docs/news/index.html b/docs/news/index.html
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
@@ -3,4 +3,4 @@ pkgdown: 2.1.0
 pkgdown_sha: ~
 articles:
   NPSutils: NPSutils.html
-last_built: 2024-10-02T17:03Z
+last_built: 2024-10-21T15:46Z
diff --git a/docs/reference/Rplot001.png b/docs/reference/Rplot001.png
diff --git a/man/load_core_metadata.Rd b/man/load_core_metadata.Rd