diff --git a/DESCRIPTION b/DESCRIPTION index 773163a..ff3164e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,7 +40,7 @@ URL: https://CRAN.R-project.org/package=phonfieldwork, https://docs.ropensci.org BugReports: https://github.com/ropensci/phonfieldwork/issues Encoding: UTF-8 LazyData: true -RoxygenNote: 7.1.1 +RoxygenNote: 7.2.2 VignetteBuilder: knitr Suggests: knitr, diff --git a/NAMESPACE b/NAMESPACE index 00fb98a..94017c1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(create_glossed_document) export(create_presentation) export(create_subannotation) export(create_viewer) +export(df_to_eaf) export(df_to_tier) export(draw_sound) export(draw_spectrogram) @@ -41,6 +42,7 @@ importFrom(graphics,points) importFrom(graphics,rect) importFrom(graphics,segments) importFrom(graphics,text) +importFrom(mime,guess_type) importFrom(rmarkdown,render) importFrom(stats,fft) importFrom(stats,sd) diff --git a/R/df_to_eaf.R b/R/df_to_eaf.R new file mode 100644 index 0000000..22a5978 --- /dev/null +++ b/R/df_to_eaf.R @@ -0,0 +1,478 @@ +#' Dataframe to .eaf +#' +#' Convert a dataframe to Elan file .exb +#' +#' @author Sergej Kudrjashov +#' +#' @param df an R dataframe object that contains columns named 'tier', 'id', 'tier_name', +#' 'content', 'time_start', 'time_end' and preferably also 'tier_type', 'stereotype', +#' 'tier_ref', 'event_local_id', 'dependent_on' that are specific for eaf file +#' @param output_file the name of the result .eaf file +#' @param output_dir the output directory for the rendered file (defalut is used if not spectified) +#' @param ref_file a filepath for connected media file (not obligatory) - if specified, overwrites existing parameters +#' @param mime_type a MIME type of connected media file (not obligatory) - if specified, overwrites existing parameters +#' @return .xml file +#' @examples +#' +#' df <- eaf_to_df(system.file("extdata", "test.eaf", package = "phonfieldwork")) +#' +#' df_to_eaf(df = df, +#' output_file = 'test.eaf', +#' ref_file = 'test.wav') +#' +#' @importFrom mime guess_type +#' @export +#' + +df_to_eaf <- function(df, output_file, output_dir = '', ref_file = '', mime_type = '') { + + #--- main body (maybe add date) + my_eaf <- paste( + '', + '', + '%s', #header + '%s', #time slots + '%s', #tiers + '%s', #tier types + '\t', + '\t', + '\t', + '\t', + '', + sep = '\n') + + + #--- header + header <- paste( + '\t
', + '\t\t', + '\t\turn:nl-mpi-tools-elan-eaf:e7d15769-9e52-4663-aa66-5033ddad8142', + '\t\t', + '\t
', + sep = '\n') + + header_v2 <- paste( + '\t
', + '\t\turn:nl-mpi-tools-elan-eaf:e7d15769-9e52-4663-aa66-5033ddad8142', + '\t\t', + '\t
', + sep = '\n') + + #--- time order + time_slots <- paste( + '\t', + '%s', + '\t', + sep = '\n') + + slot <- paste( + '\t\t', + sep = '\n') + + #--- tiers + indep_tier <- paste( + '\t', + "%s", #annotations + '\t', + sep = '\n') + + dep_tier <- paste( + '\t', + "%s", #annotations + '\t', + sep = '\n') + + #--- annotations + align_annotation <- paste( + '\t\t', + '\t\t\t', + '\t\t\t\t%s', + '\t\t\t', + '\t\t', + sep = '\n') + + ref_annotation <- paste( + '\t\t', + '\t\t\t', + '\t\t\t\t%s', + '\t\t\t', + '\t\t', + sep = '\n') + + spec_ref_annotation <- paste( + '\t\t', + '\t\t\t' , #for symb.subdivision layer > 1 + '\t\t\t\t%s', + '\t\t\t', + '\t\t', + sep = '\n') + + #--- linguistic types + indep_type <- paste( + '\t', + sep = '\n') + + dep_type <- paste( + '\t', + sep = '\n') + + if (is.na(ref_file)) { + ref_file = '' + } + + #--- get columns + wanted_columns <- c('tier', 'id', 'content', 'tier_name', 'time_start', 'time_end') + + if (!all(wanted_columns %in% colnames(df))) { + + stop(paste('Missing columns. Annotations need to contain: ', paste(wanted_columns, collapse = ", ", sep=""))) + + } + + bool_tier_type <- FALSE + bool_tier_stereotype <- FALSE + bool_tier_ref <- FALSE + bool_tier_event_id <- FALSE + bool_dependent_on <- FALSE + + columns <- colnames(df) + if ('tier_type' %in% columns) { + bool_tier_type <- TRUE + wanted_columns <- c(wanted_columns, 'tier_type') + } + + if ('stereotype' %in% columns) { + if (!FALSE %in% (unique(df$stereotype) %in% c(NA, "Symbolic_Subdivision", + "Symbolic_Association", "Included_In", "Time_Subdivision"))) { + bool_tier_stereotype<- TRUE + wanted_columns <- c(wanted_columns, 'stereotype') + } else { + stop(paste('Unknown stereotype. Make sure the stereotype is one of: ', + paste(NA, "Symbolic_Subdivision", + "Symbolic_Association", "Included_In", "Time_Subdivision", sep = ', '))) + } + } + + if ('tier_ref' %in% columns) { + if (!FALSE %in% unique(na.omit(df$tier_ref)) %in% df$tier_name) { + bool_tier_ref <- TRUE + wanted_columns <- c(wanted_columns, 'tier_ref') + } else { + stop(paste('Some dependent tiers match none of local ids: ', + paste(unique(na.omit(df$tier_ref)[!na.omit(df$tier_ref) %in% df$tier_name]), collapse = ', '))) + } + } + + if ('event_local_id' %in% columns) { + bool_tier_event_id<- TRUE + wanted_columns <- c(wanted_columns, 'event_local_id') + + if ('dependent_on' %in% columns) { + if (!FALSE %in% unique(na.omit(df$dependent_on)) %in% df$event_local_id) { + bool_dependent_on <- TRUE + wanted_columns <- c(wanted_columns, 'dependent_on') + } else { + stop(paste('Some dependent events match none of local ids: ', + paste(na.omit(df$dependent_on)[!na.omit(df$dependent_on) %in% df$event_local_id], collapse = ', '))) + } + } + } + + if (ref_file == '') { + if (sum(!is.na(df$MEDIA_URL)) == 0) { + warning(paste('MEDIA_URL not specialized. Writing with no file connected')) + relative_ref_file <- '' + } else { + ref_file <- unique(df$MEDIA_URL, drop.na = T)[1] + relative_ref_file <- paste0('./',gsub(".+/", "", ref_file)) + } + } else { + ref_file <- paste0('file:///', ref_file) + relative_ref_file <- paste0('./',gsub(".+/", "", ref_file)) + } + + #--- external package function used + if (mime_type == '') { + if (sum(!is.na(df$MIME_TYPE)) == 0) { + warning(paste('MIME_TYPE not specialized. This may cause some problems connecting mediafile')) + if (ref_file != '') { + paste('Guessing MIME TYPE based on provided media file...') + mime_type <- mime::guess_type(ref_file) + } else { + mime_type = '' + } + } else { + mime_type <- unique(df$MIME_TYPE, drop.na = T)[1] + } + } + + #--- create table for endfile + table <- df[wanted_columns] + + if (!bool_tier_stereotype & !bool_tier_type & bool_tier_ref){ + stop(paste('Tier types and stereotypes specification is needed in order to write.')) + } + + if (!bool_tier_type) { + warning(paste('Tier types not specified. Writing with independent default tier values')) + table$tier_type <- rep(c('default'), length(table$content)) + table$stereotype <- rep(NA, length(table$content)) + table$tier_ref <- rep(NA, length(table$content)) + table$event_local_id <- rep(NA, length(table$content)) + table$dependent_on <- rep(NA, length(table$content)) + } + + if (!bool_tier_stereotype){ + warning(paste('Tier stereotypes not specified. Writing as independent tiers')) + table$stereotype <- rep(NA, length(table$content)) + table$tier_ref <- rep(NA, length(table$content)) + table$event_local_id <- rep(NA, length(table$content)) + table$dependent_on <- rep(NA, length(table$content)) + } + + #--- collect times + if (sum(is.na(table$time_start)) + sum(is.na(table$time_end)) != 0) { + warning(paste('Missing timestamps in rows: ', + paste(which(is.na(table$time_end)), collapse = " ", sep=""), + paste(which(is.na(table$time_start)), collapse = " ", sep=""), + 'They will be skipped.')) + bad_rows <- c(which(is.na(table$time_start)), which(is.na(table$time_end))) + table <- table[-bad_rows,] + } + + + + #--- getting rid of irrelevant timecodes + time_table <- subset(table, subset = table$stereotype %in% c(NA, 'Included_In')) + allTimes <- c(time_table$time_start, time_table$time_end) + + time_table <- subset(table, subset = table$stereotype %in% c('Time_Subdivision')) + allTimes_Subdiv <- c(time_table$time_start, time_table$time_end) + allTimes <- c(allTimes, setdiff(allTimes_Subdiv, allTimes)) + allTimes <- allTimes[order(allTimes)] + allTimes <- as.data.frame(cbind(ts=paste("ts", 1:length(allTimes),sep=""), + value=as.character(allTimes)), + stringsAsFactors=FALSE) + + #--- acquiring symbolic connections + symb_connections <- sapply(row.names(table[which(table$stereotype %in% + c('Symbolic_Subdivision', 'Symbolic_Association')), ]), function(i){ + event_ref <- table[i, 11] + row.names(table[which(table$event_local_id == event_ref), ]) + }) + + #--- setting up new symbolic connections + new_event_codes <- c(paste0('a', seq_along(along.with = row.names(table)))) + table$event_local_id <- new_event_codes + loc_table <- table[which(table$stereotype %in% + c('Symbolic_Subdivision', 'Symbolic_Association')), ] + ref_event <- sapply(row.names(loc_table), function(i) { + row_ref <- symb_connections[i] + table[row_ref, 10] + }) + loc_table$dependent_on <- ref_event + table[row.names(table) %in% row.names(loc_table), ] <- loc_table + + #--- acquiring tier_type stereotypes + stereotable <- table[c('tier_type', 'stereotype')] + stereotable <- unique(stereotable) + + + #--- fill head + if (ref_file == '') { + header <- sprintf(header_v2) + } else { + header <- sprintf(header, ref_file, mime_type, relative_ref_file) + } + + + #--- fill timecodes + slot <- sprintf(slot, allTimes$ts, format(as.numeric(allTimes$value)*1000, scientific = FALSE, trim = TRUE)) + slot <- paste(slot, collapse="\n") + time_slots <- sprintf(time_slots, slot) + time_slots <- paste(time_slots, collapse="\n") + + #--- fill tiers + tiers_vec <- c(rep_len(NA, length.out = length(unique(table$tier)))) + table$ts_start <- NA + table$ts_end <- NA + table <- rbind(subset(table, subset = !(table$stereotype %in% c('Time_Subdivision'))), + subset(table, subset = (table$stereotype %in% c('Time_Subdivision')))) + + cur.env <- new.env() + cur.env$allTimes <- allTimes + cur.env$table <- table + cur.env$tiers_vec <- tiers_vec + cur.env$prev_symb_value <- '' + + tiers <- sapply(unique(cur.env$table$tier), function(i) { + cur_tier <- cur.env$table[which(cur.env$table$tier == i), ] + + #--- indep tiers + if (is.na(cur_tier[1, ]$stereotype)) { + + tier <- sapply(order(cur_tier$id), function(j) { + cur <- cur_tier[which(cur_tier$id == j), ] + ts_start <- cur.env$allTimes[which(cur.env$allTimes$value == cur$time_start), 1][1] + ts_end <- cur.env$allTimes[which(cur.env$allTimes$value == cur$time_end), 1][1] + cur.env$allTimes <- cur.env$allTimes[-c(which(cur.env$allTimes$ts %in% c(ts_start, ts_end))), ] + cur$ts_start <- ts_start + cur$ts_end <- ts_end + cur.env$table[row.names(cur.env$table) %in% row.names(cur), ] <- cur + sprintf(align_annotation, cur$event_local_id, cur$ts_start, cur$ts_end, cur$content) + }) + tier <- paste(tier, collapse = '\n') + + tier_paste <- sprintf(indep_tier, cur_tier[1, ]$tier_type, cur_tier[1, ]$tier_name, + tier) + cur.env$tiers_vec[cur_tier[1, ]$tier] <- tier_paste + } + + #--- Included in stereotype + if (cur_tier[1, ]$stereotype %in% c('Included_In')) { + + tier <- sapply(order(cur_tier$id), function(j) { + cur <- cur_tier[which(cur_tier$id == j), ] + ts_start <- cur.env$allTimes[which(cur.env$allTimes$value == cur$time_start), 1][1] + ts_end <- cur.env$allTimes[which(cur.env$allTimes$value == cur$time_end), 1][1] + cur.env$allTimes <- cur.env$allTimes[-c(which(cur.env$allTimes$ts %in% c(ts_start, ts_end))), ] + cur$ts_start <- ts_start + cur$ts_end <- ts_end + cur.env$table[row.names(cur.env$table) %in% row.names(cur), ] <- cur + sprintf(align_annotation, cur$event_local_id, cur$ts_start, cur$ts_end, cur$content) + }) + tier <- paste(tier, collapse = '\n') + + tier_paste <- sprintf(dep_tier, cur_tier[1, ]$tier_type, cur_tier[1, ]$tier_ref, + cur_tier[1, ]$tier_name, tier) + cur.env$tiers_vec[cur_tier[1, ]$tier] <- tier_paste + } + + #--- Time Subdiv. stereotype + if (cur_tier[1, ]$stereotype %in% c('Time_Subdivision')) { + + refer_tier <- cur.env$table[which(cur.env$table$tier_name == cur_tier[1, ]$tier_ref), ] + + tier <- sapply(order(cur_tier$id), function(j) { + cur <- cur_tier[which(cur_tier$id == j), ] + + if (!cur$time_start %in% cur.env$allTimes$value) { + ts_start <- refer_tier[which(refer_tier$time_start == cur$time_start), ]$ts_start + } else { + ts_start <- cur.env$allTimes[which(cur.env$allTimes$value == cur$time_start), 1][1] + } + + if (!cur$time_end %in% cur.env$allTimes$value) { + ts_end <- refer_tier[which(refer_tier$time_end == cur$time_end), ]$ts_end + } else { + ts_end <- cur.env$allTimes[which(cur.env$allTimes$value == cur$time_end), 1][1] + } + + cur$ts_start <- ts_start + cur$ts_end <- ts_end + + cur.env$table[row.names(cur.env$table) %in% row.names(cur), ] <- cur + sprintf(align_annotation, cur$event_local_id, cur$ts_start, cur$ts_end, cur$content) + }) + tier <- paste(tier, collapse = '\n') + + tier_paste <- sprintf(dep_tier, cur_tier[1, ]$tier_type, cur_tier[1, ]$tier_ref, + cur_tier[1, ]$tier_name, tier) + cur.env$tiers_vec[cur_tier[1, ]$tier] <- tier_paste + } + + #--- Symb. Assoc. Stereotype + if (cur_tier[1, ]$stereotype %in% c('Symbolic_Association')) { + + tier <- sapply(order(cur_tier$id), function(j) { + cur <- cur_tier[which(cur_tier$id == j), ] + + sprintf(ref_annotation, cur$event_local_id, cur$dependent_on, cur$content) + + }) + tier <- paste(tier, collapse = '\n') + + tier_paste <- sprintf(dep_tier, cur_tier[1, ]$tier_type, cur_tier[1, ]$tier_ref, + cur_tier[1, ]$tier_name, tier) + cur.env$tiers_vec[cur_tier[1, ]$tier] <- tier_paste + } + + #--- Symb. Subdivision Stereotype + if (cur_tier[1, ]$stereotype %in% c('Symbolic_Subdivision')) { + + cur.env$cur_dep = '' + + tier <- sapply(order(cur_tier$id), function(j) { + cur <- cur_tier[which(cur_tier$id == j), ] + + if (j == 1) { + loc <- sprintf(ref_annotation, cur$event_local_id, cur$dependent_on, cur$content) + cur.env$prev_symb_value <- cur$event_local_id + cur.env$cur_dep <- cur$dependent_on + } else { + if (cur.env$cur_dep != cur$dependent_on) { + loc <- sprintf(ref_annotation, cur$event_local_id, cur$dependent_on, cur$content) + cur.env$prev_symb_value <- cur$event_local_id + cur.env$cur_dep <- cur$dependent_on + } else { + loc <- sprintf(spec_ref_annotation, cur$event_local_id, cur$dependent_on, + cur.env$prev_symb_value, cur$content) + cur.env$prev_symb_value <- cur$event_local_id + cur.env$cur_dep <- cur$dependent_on + } + } + loc + }) + tier <- paste(tier, collapse = '\n') + + tier_paste <- sprintf(dep_tier, cur_tier[1, ]$tier_type, cur_tier[1, ]$tier_ref, + cur_tier[1, ]$tier_name, tier) + cur.env$tiers_vec[cur_tier[1, ]$tier] <- tier_paste + } + + }) + + tiers <- paste(cur.env$tiers_vec, collapse = '\n') + + + #--- fill tier type info + ling_type <- sapply(row.names(stereotable), function(i){ + time_alignable <- 'true' + if (is.na(stereotable[i, 2])) { + sprintf(indep_type, stereotable[i, 1]) + } else { + if (stereotable[i, 2] %in% c('Symbolic_Subdivision', 'Symbolic_Association')) { + time_alignable <- 'false' + } + sprintf(dep_type, stereotable[i, 2], stereotable[i, 1], time_alignable) + } + }) + ling_type <- paste(ling_type, collapse = '\n') + + #---fill the main body + myEAF <- sprintf(my_eaf, header, time_slots, tiers, ling_type) + + if (output_dir != '') { + path <- normalizePath(paste(output_dir, output_file, sep = '/')) + } else { + output_dir <- getwd() + path <- normalizePath(paste(output_dir, output_file, sep = '/')) + } + + fileConn <- file(path, open="wb") + writeBin(charToRaw(myEAF), fileConn, endian="little") + close(fileConn) +} diff --git a/R/eaf_to_df.R b/R/eaf_to_df.R index b54ab03..7488282 100644 --- a/R/eaf_to_df.R +++ b/R/eaf_to_df.R @@ -1,115 +1,217 @@ -#' ELAN's .eaf file to dataframe -#' -#' Convert .eaf file from ELAN to a dataframe. -#' -#' @author George Moroz -#' -#' @param file_name string with a filename or path to the .eaf file -#' @return a dataframe with columns: \code{tier}, \code{id}, \code{content}, -#' \code{tier_name}, \code{tier_type}, \code{time_start}, \code{time_end}, -#' \code{source}). -#' -#' @examples -#' eaf_to_df(system.file("extdata", "test.eaf", package = "phonfieldwork")) -#' @export -#' @importFrom xml2 read_xml -#' @importFrom xml2 xml_find_all -#' @importFrom xml2 xml_attr -#' @importFrom xml2 xml_text -#' @importFrom xml2 xml_children - -eaf_to_df <- function(file_name) { - # read file - l <- xml2::read_xml(file_name) - # extract tiers - t <- xml2::xml_find_all(l, "TIER") - # extract tiers - tier_names <- xml2::xml_attr(t, "TIER_ID") - # tier types - tier_types <- xml2::xml_attr(t, "LINGUISTIC_TYPE_REF") - - # create list of dataframes - r <- lapply(seq_along(t), function(i) { - content <- xml2::xml_text(xml2::xml_find_all( - t[[i]], - "ANNOTATION/*/ANNOTATION_VALUE" - )) - ts1 <- xml2::xml_attr( - xml2::xml_children(xml2::xml_children(t[[i]])), - "TIME_SLOT_REF1" - ) - ts2 <- xml2::xml_attr( - xml2::xml_children(xml2::xml_children(t[[i]])), - "TIME_SLOT_REF2" - ) - a_id <- xml2::xml_attr( - xml2::xml_children(xml2::xml_children(t[[i]])), - "ANNOTATION_ID" - ) - ar <- xml2::xml_attr( - xml2::xml_children(xml2::xml_children(t[[i]])), - "ANNOTATION_REF" - ) - if (length(content) > 0) { - data.frame( - tier = i, - id = seq_along(content), - content = content, - tier_name = tier_names[i], - tier_type = tier_types[i], - ts_start = ts1, - ts_end = ts2, - a_id = a_id, - ar = ar, - stringsAsFactors = FALSE - ) - } - }) - - # merge list of dataframes into dataframe - r <- do.call(rbind, r) - - if (length(r) > 0) { - # extract info about time - ts <- data.frame( - ts_id = xml2::xml_attr( - xml2::xml_find_all(l, "TIME_ORDER/TIME_SLOT"), "TIME_SLOT_ID" - ), - time_value = as.numeric(xml2::xml_attr( - xml2::xml_find_all(l, "TIME_ORDER/TIME_SLOT"), "TIME_VALUE" - )) / 1000, - stringsAsFactors = FALSE - ) - - - # df with time markers - tm <- r[is.na(r$ar), c("ts_start", "ts_end", "a_id")] - # df without time markers - wtm <- r[!is.na(r$ar), c("a_id", "ar")] - - # create df with all time stamp - while (nrow(tm) < nrow(r)) { - df <- unique(merge( - x = wtm, - y = tm, - by.x = "ar", - by.y = "a_id" - )[, c("ts_start", "ts_end", "a_id")]) - tm <- unique(rbind(tm, df)) - } - - # result df with time stamps - r <- merge(r[, -c(6:7)], tm) - # merge with time stamp df ts_start column - r <- merge(r, ts, by.x = "ts_start", by.y = "ts_id") - names(r)[names(r) == "time_value"] <- "time_start" - # merge with time stamp df ts_end column - r <- merge(r, ts, by.x = "ts_end", by.y = "ts_id") - names(r)[names(r) == "time_value"] <- "time_end" - - # make sorting and remove some columns - r <- r[order(r$time_start, r$tier), -c(1:3, 9)] - r$source <- basename(file_name) - return(r) - } -} +#' ELAN's .eaf file to dataframe +#' +#' Convert .eaf file from ELAN to a dataframe. +#' +#' @author George Moroz +#' @author Kudrjashov Sergej +#' +#' @param file_name string with a filename or path to the .eaf file +#' @return a dataframe with columns: \code{tier}, \code{id}, \code{content}, +#' \code{tier_name}, \code{tier_type}, \code{tier_ref}, \code{event_local_id}, +#' \code{dependent_on}, \code{time_start}, \code{time_end}, \code{source}, +#' \code{MEDIA_URL}, \code{MIME_TYPE}, \code{RELATIVE_MEDIA_URL}. +#' +#' @examples +#' eaf_to_df(system.file("extdata", "test.eaf", package = "phonfieldwork")) +#' @export +#' @importFrom xml2 read_xml +#' @importFrom xml2 xml_find_all +#' @importFrom xml2 xml_attr +#' @importFrom xml2 xml_text +#' @importFrom xml2 xml_children +#' + +eaf_to_df <- function(file_name) { + + #extension check + if (sub(pattern = "^(.*\\.|[^.]+)(?=[^.]*)", replacement = "", file_name, perl = TRUE) != "eaf") { + stop("Wrong file extension. Assumed: .eaf; Got: " + sub(pattern = "^(.*\\.|[^.]+)(?=[^.]*)", replacement = ".", file_name, perl = TRUE)) + } + + # read file + l <- xml2::read_xml(file_name) + # extract tiers + t <- xml2::xml_find_all(l, "TIER") + # extract tiers + tier_names <- xml2::xml_attr(t, "TIER_ID") + # tier types + tier_types <- xml2::xml_attr(t, "LINGUISTIC_TYPE_REF") + #tier parent + tier_parent <- xml2::xml_attr(t, "PARENT_REF") + #tier_stereotype + tier_stereotype <- data.frame(xml2::xml_attr(xml2::xml_find_all(l, "LINGUISTIC_TYPE"), + "LINGUISTIC_TYPE_ID"), xml2::xml_attr(xml2::xml_find_all(l, "LINGUISTIC_TYPE"), + "CONSTRAINTS")) + colnames(tier_stereotype) <- c("tier_type", "stereotype") + + + # create list of dataframes + r <- lapply(seq_along(t), function(i) { + content <- xml2::xml_text(xml2::xml_find_all( + t[[i]], + "ANNOTATION/*/ANNOTATION_VALUE" + )) + ts1 <- xml2::xml_attr( + xml2::xml_children(xml2::xml_children(t[[i]])), + "TIME_SLOT_REF1" + ) + ts2 <- xml2::xml_attr( + xml2::xml_children(xml2::xml_children(t[[i]])), + "TIME_SLOT_REF2" + ) + a_id <- xml2::xml_attr( + xml2::xml_children(xml2::xml_children(t[[i]])), + "ANNOTATION_ID" + ) + ar <- xml2::xml_attr( + xml2::xml_children(xml2::xml_children(t[[i]])), + "ANNOTATION_REF" + ) + if (length(content) > 0) { + data.frame( + tier = i, + id = seq_along(content), + content = content, + tier_name = tier_names[i], + tier_type = tier_types[i], + tier_ref = tier_parent[i], + ts_start = ts1, + ts_end = ts2, + a_id = a_id, + ar = ar, + stringsAsFactors = FALSE + ) + } + }) + + # merge list of dataframes into dataframe + r <- do.call(rbind, r) + r$id_ <- 1:nrow(r) + r <- merge(r, tier_stereotype, by = 'tier_type') + r <- r[order(r$id_), ] + r <- subset(r, select = -c(id_)) + r <- subset(r, select = c(2, 3, 4, 5, 1, 11, 6, 7, 8, 9, 10)) + + if (length(r) > 0) { + # extract info about time + ts <- data.frame( + ts_id = xml2::xml_attr( + xml2::xml_find_all(l, "TIME_ORDER/TIME_SLOT"), "TIME_SLOT_ID" + ), + time_value = as.numeric(xml2::xml_attr( + xml2::xml_find_all(l, "TIME_ORDER/TIME_SLOT"), "TIME_VALUE" + )) / 1000, + stringsAsFactors = FALSE + ) + + + # df with time markers + tm <- r[is.na(r$ar), c("ts_start", "ts_end", "a_id")] + # df without time markers + wtm <- r[!is.na(r$ar), c("a_id", "ar")] + + # create df with all time stamp + while (nrow(tm) < nrow(r)) { + df <- unique(merge( + x = wtm, + y = tm, + by.x = "ar", + by.y = "a_id" + )[, c("ts_start", "ts_end", "a_id")]) + tm <- unique(rbind(tm, df)) + } + + # result df with time stamps + r <- merge(r[, -c(8:9)], tm) + # merge with time stamp df ts_start column + r <- merge(r, ts, by.x = "ts_start", by.y = "ts_id") + names(r)[names(r) == "time_value"] <- "time_start" + # merge with time stamp df ts_end column + r <- merge(r, ts, by.x = "ts_end", by.y = "ts_id") + names(r)[names(r) == "time_value"] <- "time_end" + + #fix issues with Symbolic Subdivision stereotype + r_sub <- r[which(r$stereotype == 'Symbolic_Subdivision'), ] + un_r_sub <- unique(r_sub$ar) + loc_res <- lapply(un_r_sub, function(i){ + df_loc <- r_sub[r_sub$ar == i, ] + time_start <- df_loc[1, ]$time_start + time_end <- df_loc[1, ]$time_end + round(seq(from = time_start, to = time_end, length.out = nrow(df_loc)+1), digits = 3) + }) + + + loc <- list() + r_sub[, 12] <- unlist(lapply(loc_res, function(i){ + loc <- append(loc, i[1:length(i)-1]) + })) + + r_sub[, 13] <- unlist(lapply(loc_res, function(i){ + loc <- append(loc, i[2:length(i)]) + })) + + r[row.names(r) %in% row.names(r_sub), ] <- r_sub + + #fix issues with Symbolic Association stereotype + r_sub <- r[which(r$stereotype == 'Symbolic_Association'), ] + + r_sub[, 12] <- unlist(lapply(row.names(r_sub), function(i) { + r[r$a_id == r_sub[i, ]$ar, ]$time_start + })) + + r_sub[, 13] <- unlist(lapply(row.names(r_sub), function(i) { + r[r$a_id == r_sub[i, ]$ar, ]$time_end + })) + + r[row.names(r) %in% row.names(r_sub), ] <- r_sub + + #fix issues with Time Subdivision stereotype + + r_sub <- r[which(r$stereotype == 'Time_Subdivision'), ] + r_sub <- r_sub[order(r_sub$tier_name, decreasing = FALSE), ] + if (sum(is.na(r_sub$time_start)) + sum(is.na(r_sub$time_end)) != 0) { + + r_loc_sub <- r_sub[which(is.na(r_sub$time_start) | is.na(r_sub$time_end)), ] + na_end <- r_loc_sub[which(!is.na(r_loc_sub$time_end)), ] + na_start <- r_loc_sub[which(!is.na(r_loc_sub$time_start)), ] + loc_res <- lapply(seq(length.out = nrow(na_start)), function(i) { + + id_start <- na_start[i, 5] + id_end <- na_end[i, 5] + time_start <- na_start[i, 12] + time_end <- na_end[i, 13] + round(seq(from = time_start, to = time_end, length.out = id_end-id_start+2), digits = 3) + + }) + + loc <- list() + r_loc_sub[, 12] <- unlist(lapply(loc_res, function(i){ + loc <- append(loc, i[1:length(i)-1]) + })) + + r_loc_sub[, 13] <- unlist(lapply(loc_res, function(i){ + loc <- append(loc, i[2:length(i)]) + })) + + r_sub[row.names(r_sub) %in% row.names(r_loc_sub), ] <- r_loc_sub + + r[row.names(r) %in% row.names(r_sub), ] <- r_sub + + } + + # make sorting and remove some columns + r <- r[order(r$time_start, r$tier), -c(1:2)] + names(r)[names(r) == 'ar'] <- 'dependent_on' + names(r)[names(r) == 'a_id'] <- 'event_local_id' + r <- r[, c(2,3,4,5,6,7,8,1,9,10,11)] + + #connected file + r$MEDIA_URL <- xml2::xml_attr(xml2::xml_children(xml2::xml_find_all(l, 'HEADER'))[1], 'MEDIA_URL') + r$MIME_TYPE <- xml2::xml_attr(xml2::xml_children(xml2::xml_find_all(l, 'HEADER'))[1], 'MIME_TYPE') + r$RELATIVE_MEDIA_URL <- xml2::xml_attr(xml2::xml_children(xml2::xml_find_all(l, 'HEADER'))[1], 'RELATIVE_MEDIA_URL') + + r$source <- basename(file_name) + return(r) + } +} + diff --git a/inst/extdata/test.eaf b/inst/extdata/test.eaf index 5aa18e5..d6bcd71 100644 --- a/inst/extdata/test.eaf +++ b/inst/extdata/test.eaf @@ -1,8 +1,10 @@ - +
urn:nl-mpi-tools-elan-eaf:f86eb0b2-4e24-474f-bd43-1221c73dd19f - 12 + 14
@@ -11,92 +13,126 @@ - + - + - + - + - - + + + + + + - - + + - + t - + e - + s - + t - - + + - - + + - + C - + V - + C - + C - - + + - - - + + + + this is just a basic sentence nothing special + + + + + one more sentence + + + + + + - +
diff --git a/inst/extdata/test.pfsx b/inst/extdata/test.pfsx new file mode 100644 index 0000000..036db9e --- /dev/null +++ b/inst/extdata/test.pfsx @@ -0,0 +1,107 @@ + + + + Время начала + Время окончания + Слой + Инициалы + Комментарий + Ветка + Отправитель + Получатель + Дата создания + Дата изменения + + + 1920,1040 + + + 0,0 + + + 84.0 + + + 6 + + + mpi.eudico.client.annotator.viewer.TimeLineViewer + + + 11690 + + + intervals + + + 8595 + + + 76 + + + 13011 + + + 26461 + + + + 160,90,90 + + + 0,90,90 + + + 90,90,160 + + + + intervals + empty_intervals + sentence + + + 1 + + + 1 + + + + 75 + + + 75 + + + 75 + + + 75 + + + 75 + + + 75 + + + 0 + + + 0 + + + 75 + + + 75 + + + + Отправитель + Получатель + + diff --git a/man/create_viewer.Rd b/man/create_viewer.Rd index 9fffa27..e8a3de0 100644 --- a/man/create_viewer.Rd +++ b/man/create_viewer.Rd @@ -6,7 +6,7 @@ \usage{ create_viewer( audio_dir, - picture_dir, + picture_dir = NULL, table, captions = NULL, sorting_columns = NULL, diff --git a/man/df_to_eaf.Rd b/man/df_to_eaf.Rd new file mode 100644 index 0000000..44aeafc --- /dev/null +++ b/man/df_to_eaf.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/df_to_eaf.R +\name{df_to_eaf} +\alias{df_to_eaf} +\title{Dataframe to .eaf} +\usage{ +df_to_eaf(df, output_file, output_dir = "", ref_file = "", mime_type = "") +} +\arguments{ +\item{df}{an R dataframe object that contains columns named 'tier', 'id', 'tier_name', +'content', 'time_start', 'time_end' and preferably also 'tier_type', 'stereotype', +'tier_ref', 'event_local_id', 'dependent_on' that are specific for eaf file} + +\item{output_file}{the name of the result .eaf file} + +\item{output_dir}{the output directory for the rendered file (defalut is used if not spectified)} + +\item{ref_file}{a filepath for connected media file (not obligatory) - if specified, overwrites existing parameters} + +\item{mime_type}{a MIME type of connected media file (not obligatory) - if specified, overwrites existing parameters} +} +\value{ +.xml file +} +\description{ +Convert a dataframe to Elan file .exb +} +\examples{ + +df <- eaf_to_df(system.file("extdata", "test.eaf", package = "phonfieldwork")) + +df_to_eaf(df = df, + output_file = 'test.eaf', + ref_file = 'test.wav') + +} +\author{ +Sergej Kudrjashov +} diff --git a/man/eaf_to_df.Rd b/man/eaf_to_df.Rd index 7e72421..d010801 100644 --- a/man/eaf_to_df.Rd +++ b/man/eaf_to_df.Rd @@ -1,25 +1,28 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/eaf_to_df.R -\name{eaf_to_df} -\alias{eaf_to_df} -\title{ELAN's .eaf file to dataframe} -\usage{ -eaf_to_df(file_name) -} -\arguments{ -\item{file_name}{string with a filename or path to the .eaf file} -} -\value{ -a dataframe with columns: \code{tier}, \code{id}, \code{content}, -\code{tier_name}, \code{tier_type}, \code{time_start}, \code{time_end}, -\code{source}). -} -\description{ -Convert .eaf file from ELAN to a dataframe. -} -\examples{ -eaf_to_df(system.file("extdata", "test.eaf", package = "phonfieldwork")) -} -\author{ -George Moroz -} +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/eaf_to_df.R +\name{eaf_to_df} +\alias{eaf_to_df} +\title{ELAN's .eaf file to dataframe} +\usage{ +eaf_to_df(file_name) +} +\arguments{ +\item{file_name}{string with a filename or path to the .eaf file} +} +\value{ +a dataframe with columns: \code{tier}, \code{id}, \code{content}, +\code{tier_name}, \code{tier_type}, \code{tier_ref}, \code{event_local_id}, + \code{dependent_on}, \code{time_start}, \code{time_end}, \code{source}, + \code{MEDIA_URL}, \code{MIME_TYPE}, \code{RELATIVE_MEDIA_URL}. +} +\description{ +Convert .eaf file from ELAN to a dataframe. +} +\examples{ +eaf_to_df(system.file("extdata", "test.eaf", package = "phonfieldwork")) +} +\author{ +George Moroz + +Kudrjashov Sergej +}